In [1]:
import pandas as pd,os
In [2]:
path="E:\\Projects\\Jigsaw\\Capstone"
os.chdir(path)
In [3]:
os.getcwd()
Out[3]:
'E:\\Projects\\Jigsaw\\Capstone'
In [4]:
df=pd.read_excel('Student Applications & Performance.xlsx')
In [5]:
df.shape
Out[5]:
(3400, 56)
In [6]:
df.isnull().sum()
Out[6]:
STUDENT IDENTIFIER             0
STDNT_AGE                      0
STDNT_GENDER                   0
STDNT_BACKGROUND               0
IN_STATE_FLAG                  0
INTERNATIONAL_STS              0
STDNT_MAJOR                    0
STDNT_MINOR                    0
STDNT_TEST_ENTRANCE1        2294
STDNT_TEST_ENTRANCE2         908
STDNT_TEST_ENTRANCE_COMB     518
FIRST_TERM                     0
CORE_COURSE_NAME_1_F           0
CORE_COURSE_GRADE_1_F          0
CORE_COURSE_NAME_2_F          99
CORE_COURSE_GRADE_2_F         99
CORE_COURSE_NAME_3_F         565
CORE_COURSE_GRADE_3_F        565
CORE_COURSE_NAME_4_F        1597
CORE_COURSE_GRADE_4_F       1597
CORE_COURSE_NAME_5_F        2755
CORE_COURSE_GRADE_5_F       2755
CORE_COURSE_NAME_6_F        3272
CORE_COURSE_GRADE_6_F       3272
SECOND_TERM                    0
CORE_COURSE_NAME_1_S         157
CORE_COURSE_GRADE_1_S        232
CORE_COURSE_NAME_2_S         439
CORE_COURSE_GRADE_2_S        439
CORE_COURSE_NAME_3_S        1038
CORE_COURSE_GRADE_3_S       1038
CORE_COURSE_NAME_4_S        2045
CORE_COURSE_GRADE_4_S       2045
CORE_COURSE_NAME_5_S        2950
CORE_COURSE_GRADE_5_S       2950
CORE_COURSE_NAME_6_S        3319
CORE_COURSE_GRADE_6_S       3319
HOUSING_STS                    0
RETURNED_2ND_YR                0
DISTANCE_FROM_HOME            25
HIGH_SCHL_GPA                 53
HIGH_SCHL_NAME                 1
FATHER_HI_EDU_CD             432
FATHER_HI_EDU_DESC             0
MOTHER_HI_EDU_CD             489
MOTHER_HI_EDU_DESC             0
DEGREE_GROUP_CD                0
DEGREE_GROUP_DESC              0
FIRST_TERM_ATTEMPT_HRS         0
FIRST_TERM_EARNED_HRS          0
SECOND_TERM_ATTEMPT_HRS      206
SECOND_TERM_EARNED_HRS       209
GROSS_FIN_NEED                 0
COST_OF_ATTEND                 0
EST_FAM_CONTRIBUTION           0
UNMET_NEED                     0
dtype: int64
In [7]:
df_remove=pd.DataFrame(round((df.isnull().sum()/df.shape[0])*100,2).sort_values(ascending=False)).reset_index()
In [8]:
df_remove
Out[8]:
index 0
0 CORE_COURSE_NAME_6_S 97.62
1 CORE_COURSE_GRADE_6_S 97.62
2 CORE_COURSE_NAME_6_F 96.24
3 CORE_COURSE_GRADE_6_F 96.24
4 CORE_COURSE_NAME_5_S 86.76
5 CORE_COURSE_GRADE_5_S 86.76
6 CORE_COURSE_NAME_5_F 81.03
7 CORE_COURSE_GRADE_5_F 81.03
8 STDNT_TEST_ENTRANCE1 67.47
9 CORE_COURSE_NAME_4_S 60.15
10 CORE_COURSE_GRADE_4_S 60.15
11 CORE_COURSE_GRADE_4_F 46.97
12 CORE_COURSE_NAME_4_F 46.97
13 CORE_COURSE_NAME_3_S 30.53
14 CORE_COURSE_GRADE_3_S 30.53
15 STDNT_TEST_ENTRANCE2 26.71
16 CORE_COURSE_GRADE_3_F 16.62
17 CORE_COURSE_NAME_3_F 16.62
18 STDNT_TEST_ENTRANCE_COMB 15.24
19 MOTHER_HI_EDU_CD 14.38
20 CORE_COURSE_GRADE_2_S 12.91
21 CORE_COURSE_NAME_2_S 12.91
22 FATHER_HI_EDU_CD 12.71
23 CORE_COURSE_GRADE_1_S 6.82
24 SECOND_TERM_EARNED_HRS 6.15
25 SECOND_TERM_ATTEMPT_HRS 6.06
26 CORE_COURSE_NAME_1_S 4.62
27 CORE_COURSE_NAME_2_F 2.91
28 CORE_COURSE_GRADE_2_F 2.91
29 HIGH_SCHL_GPA 1.56
30 DISTANCE_FROM_HOME 0.74
31 HIGH_SCHL_NAME 0.03
32 STDNT_BACKGROUND 0.00
33 STDNT_AGE 0.00
34 STDNT_GENDER 0.00
35 CORE_COURSE_GRADE_1_F 0.00
36 IN_STATE_FLAG 0.00
37 INTERNATIONAL_STS 0.00
38 CORE_COURSE_NAME_1_F 0.00
39 FIRST_TERM 0.00
40 STDNT_MINOR 0.00
41 STDNT_MAJOR 0.00
42 UNMET_NEED 0.00
43 SECOND_TERM 0.00
44 EST_FAM_CONTRIBUTION 0.00
45 HOUSING_STS 0.00
46 RETURNED_2ND_YR 0.00
47 FATHER_HI_EDU_DESC 0.00
48 MOTHER_HI_EDU_DESC 0.00
49 DEGREE_GROUP_CD 0.00
50 DEGREE_GROUP_DESC 0.00
51 FIRST_TERM_ATTEMPT_HRS 0.00
52 FIRST_TERM_EARNED_HRS 0.00
53 GROSS_FIN_NEED 0.00
54 COST_OF_ATTEND 0.00
55 STUDENT IDENTIFIER 0.00
In [9]:
df_remove=pd.DataFrame(round((df.isnull().sum()/df.shape[0]*100)).nlargest(16)).reset_index()['index'].sort_values(ascending=False)
In [10]:
kp=df.drop(columns=df_remove.tolist(),axis=1,inplace=True)
In [11]:
df.shape
Out[11]:
(3400, 40)
In [12]:
df.columns
Out[12]:
Index(['STUDENT IDENTIFIER', 'STDNT_AGE', 'STDNT_GENDER', 'STDNT_BACKGROUND',
       'IN_STATE_FLAG', 'INTERNATIONAL_STS', 'STDNT_MAJOR', 'STDNT_MINOR',
       'STDNT_TEST_ENTRANCE_COMB', 'FIRST_TERM', 'CORE_COURSE_NAME_1_F',
       'CORE_COURSE_GRADE_1_F', 'CORE_COURSE_NAME_2_F',
       'CORE_COURSE_GRADE_2_F', 'CORE_COURSE_NAME_3_F',
       'CORE_COURSE_GRADE_3_F', 'SECOND_TERM', 'CORE_COURSE_NAME_1_S',
       'CORE_COURSE_GRADE_1_S', 'CORE_COURSE_NAME_2_S',
       'CORE_COURSE_GRADE_2_S', 'HOUSING_STS', 'RETURNED_2ND_YR',
       'DISTANCE_FROM_HOME', 'HIGH_SCHL_GPA', 'HIGH_SCHL_NAME',
       'FATHER_HI_EDU_CD', 'FATHER_HI_EDU_DESC', 'MOTHER_HI_EDU_CD',
       'MOTHER_HI_EDU_DESC', 'DEGREE_GROUP_CD', 'DEGREE_GROUP_DESC',
       'FIRST_TERM_ATTEMPT_HRS', 'FIRST_TERM_EARNED_HRS',
       'SECOND_TERM_ATTEMPT_HRS', 'SECOND_TERM_EARNED_HRS', 'GROSS_FIN_NEED',
       'COST_OF_ATTEND', 'EST_FAM_CONTRIBUTION', 'UNMET_NEED'],
      dtype='object')
In [13]:
df.isnull().sum()
Out[13]:
STUDENT IDENTIFIER            0
STDNT_AGE                     0
STDNT_GENDER                  0
STDNT_BACKGROUND              0
IN_STATE_FLAG                 0
INTERNATIONAL_STS             0
STDNT_MAJOR                   0
STDNT_MINOR                   0
STDNT_TEST_ENTRANCE_COMB    518
FIRST_TERM                    0
CORE_COURSE_NAME_1_F          0
CORE_COURSE_GRADE_1_F         0
CORE_COURSE_NAME_2_F         99
CORE_COURSE_GRADE_2_F        99
CORE_COURSE_NAME_3_F        565
CORE_COURSE_GRADE_3_F       565
SECOND_TERM                   0
CORE_COURSE_NAME_1_S        157
CORE_COURSE_GRADE_1_S       232
CORE_COURSE_NAME_2_S        439
CORE_COURSE_GRADE_2_S       439
HOUSING_STS                   0
RETURNED_2ND_YR               0
DISTANCE_FROM_HOME           25
HIGH_SCHL_GPA                53
HIGH_SCHL_NAME                1
FATHER_HI_EDU_CD            432
FATHER_HI_EDU_DESC            0
MOTHER_HI_EDU_CD            489
MOTHER_HI_EDU_DESC            0
DEGREE_GROUP_CD               0
DEGREE_GROUP_DESC             0
FIRST_TERM_ATTEMPT_HRS        0
FIRST_TERM_EARNED_HRS         0
SECOND_TERM_ATTEMPT_HRS     206
SECOND_TERM_EARNED_HRS      209
GROSS_FIN_NEED                0
COST_OF_ATTEND                0
EST_FAM_CONTRIBUTION          0
UNMET_NEED                    0
dtype: int64
In [14]:
df.columns.to_series().groupby(df.dtypes).groups
Out[14]:
{dtype('int64'): Index(['STUDENT IDENTIFIER', 'STDNT_AGE', 'FIRST_TERM', 'SECOND_TERM',
        'RETURNED_2ND_YR', 'FIRST_TERM_ATTEMPT_HRS', 'FIRST_TERM_EARNED_HRS',
        'GROSS_FIN_NEED', 'COST_OF_ATTEND', 'EST_FAM_CONTRIBUTION'],
       dtype='object'),
 dtype('float64'): Index(['STDNT_TEST_ENTRANCE_COMB', 'DISTANCE_FROM_HOME', 'HIGH_SCHL_GPA',
        'FATHER_HI_EDU_CD', 'MOTHER_HI_EDU_CD', 'SECOND_TERM_ATTEMPT_HRS',
        'SECOND_TERM_EARNED_HRS', 'UNMET_NEED'],
       dtype='object'),
 dtype('O'): Index(['STDNT_GENDER', 'STDNT_BACKGROUND', 'IN_STATE_FLAG',
        'INTERNATIONAL_STS', 'STDNT_MAJOR', 'STDNT_MINOR',
        'CORE_COURSE_NAME_1_F', 'CORE_COURSE_GRADE_1_F', 'CORE_COURSE_NAME_2_F',
        'CORE_COURSE_GRADE_2_F', 'CORE_COURSE_NAME_3_F',
        'CORE_COURSE_GRADE_3_F', 'CORE_COURSE_NAME_1_S',
        'CORE_COURSE_GRADE_1_S', 'CORE_COURSE_NAME_2_S',
        'CORE_COURSE_GRADE_2_S', 'HOUSING_STS', 'HIGH_SCHL_NAME',
        'FATHER_HI_EDU_DESC', 'MOTHER_HI_EDU_DESC', 'DEGREE_GROUP_CD',
        'DEGREE_GROUP_DESC'],
       dtype='object')}
In [15]:
# Dropping the columns which not required
# Dropping of columns
df.drop(columns=['STUDENT IDENTIFIER','FATHER_HI_EDU_DESC','MOTHER_HI_EDU_DESC','DEGREE_GROUP_DESC','FIRST_TERM','SECOND_TERM'],axis=1,inplace=True)
In [16]:
df.shape
Out[16]:
(3400, 34)
In [17]:
df.head()
Out[17]:
STDNT_AGE STDNT_GENDER STDNT_BACKGROUND IN_STATE_FLAG INTERNATIONAL_STS STDNT_MAJOR STDNT_MINOR STDNT_TEST_ENTRANCE_COMB CORE_COURSE_NAME_1_F CORE_COURSE_GRADE_1_F ... MOTHER_HI_EDU_CD DEGREE_GROUP_CD FIRST_TERM_ATTEMPT_HRS FIRST_TERM_EARNED_HRS SECOND_TERM_ATTEMPT_HRS SECOND_TERM_EARNED_HRS GROSS_FIN_NEED COST_OF_ATTEND EST_FAM_CONTRIBUTION UNMET_NEED
0 18 F BGD 1 Y N Undeclared N 1150.0 ANTH 1105 A ... NaN B 16 16 14.0 14.0 0 0 0 0.0
1 19 F BGD 1 N N Undeclared N 1190.0 ANTH 1105 A ... 3.0 B 18 18 18.0 18.0 570000 1355760 785760 459300.0
2 18 M BGD 1 Y N Mathematics N 1030.0 ANTH 1105 A ... 4.0 B 15 15 14.0 14.0 0 0 0 0.0
3 18 M BGD 1 Y N Undeclared N 1220.0 ANTH 1107 A ... 3.0 B 13 13 14.0 14.0 0 0 0 0.0
4 18 F BGD 1 Y N Art N 1190.0 ANTH 1107 A ... 2.0 B 12 12 12.0 12.0 835920 1355760 519840 278340.0

5 rows × 34 columns

In [18]:
df['FATHER_HI_EDU_CD']=df['FATHER_HI_EDU_CD'].fillna(df['FATHER_HI_EDU_CD'].value_counts().keys()[0])
df['FATHER_HI_EDU_CD'].value_counts().keys()
Out[18]:
Float64Index([2.0, 3.0, 4.0, 1.0], dtype='float64')
In [19]:
df['MOTHER_HI_EDU_CD']=df['MOTHER_HI_EDU_CD'].fillna(df['MOTHER_HI_EDU_CD'].value_counts().keys()[0])
In [20]:
df['SECOND_TERM_ATTEMPT_HRS']=df['SECOND_TERM_ATTEMPT_HRS'].fillna(df['SECOND_TERM_ATTEMPT_HRS'].mean())
df['SECOND_TERM_EARNED_HRS']=df['SECOND_TERM_EARNED_HRS'].fillna(df['SECOND_TERM_EARNED_HRS'].mean())
df['STDNT_TEST_ENTRANCE_COMB'].fillna(df['STDNT_TEST_ENTRANCE_COMB'].mean(),inplace=True)
df['DISTANCE_FROM_HOME'].fillna(df['DISTANCE_FROM_HOME'].mean(),inplace=True)
df['HIGH_SCHL_GPA'].fillna(df['HIGH_SCHL_GPA'].mean(),inplace=True)
df['HIGH_SCHL_NAME'].fillna(df['HIGH_SCHL_NAME'].value_counts().keys()[0],inplace=True)
In [21]:
df['CORE_COURSE_NAME_2_F'].fillna("ENGL 1101",inplace=True)
df['CORE_COURSE_GRADE_2_F'].fillna("B",inplace=True)
df['CORE_COURSE_NAME_3_F'].fillna("ENGL 1101",inplace=True)
df['CORE_COURSE_GRADE_3_F'].fillna("B",inplace=True)
df['CORE_COURSE_NAME_1_S'].fillna("ENGL 1102",inplace=True)
df['CORE_COURSE_GRADE_1_S'].fillna("C",inplace=True)
df['CORE_COURSE_NAME_2_S'].fillna("ENGL 1102",inplace=True)
df['CORE_COURSE_GRADE_2_S'].fillna("B",inplace=True)
In [22]:
#Replacing Incompl with Not Rep for CORE_COURSE_GRADE_1_F
df['CORE_COURSE_GRADE_1_F'].replace('INCOMPL','NOT REP',inplace=True)
#Replacing Incompl with Not Rep for CORE_COURSE_GRADE_1_F and 1_S
df['CORE_COURSE_GRADE_2_F'].replace('INCOMPL','NOT REP',inplace=True)
df['CORE_COURSE_GRADE_1_S'].replace('INCOMPL','NOT REP',inplace=True)
In [23]:
df['CORE_COURSE_GRADE_1_S'].unique()
Out[23]:
array(['A', 'B', 'C', 'D', 'F', 'NOT REP', 'Unknown'], dtype=object)
In [24]:
df['CORE_COURSE_GRADE_1_F'].unique()
Out[24]:
array(['A', 'B', 'C', 'NOT REP', 'D', 'F'], dtype=object)
In [25]:
#Replacing the School names
df['HIGH_SCHL_NAME']=df['HIGH_SCHL_NAME'].str.replace("SCHOOL","")
In [26]:
df['HIGH_SCHL_NAME']=df['HIGH_SCHL_NAME'].astype('int64')
In [27]:
#Replacing the student background
df['STDNT_BACKGROUND']=df['STDNT_BACKGROUND'].str.replace("BGD","")
In [28]:
df['STDNT_BACKGROUND']=df['STDNT_BACKGROUND'].astype('int64')
df['FATHER_HI_EDU_CD']=df['FATHER_HI_EDU_CD'].astype('int64')
In [29]:
#encoding the variables.
#for unmet needs we shall replace the positive numbers as 1, negative numbers as 2 and for other numbers as 0
df['UNMET_NEED'][df['UNMET_NEED']>0]=1
df['UNMET_NEED'][df['UNMET_NEED']<0]=2
df['UNMET_NEED'][df['UNMET_NEED']==0]=0
D:\Users\welcome\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
D:\Users\welcome\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
D:\Users\welcome\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
In [30]:
df.isnull().sum()
Out[30]:
STDNT_AGE                   0
STDNT_GENDER                0
STDNT_BACKGROUND            0
IN_STATE_FLAG               0
INTERNATIONAL_STS           0
STDNT_MAJOR                 0
STDNT_MINOR                 0
STDNT_TEST_ENTRANCE_COMB    0
CORE_COURSE_NAME_1_F        0
CORE_COURSE_GRADE_1_F       0
CORE_COURSE_NAME_2_F        0
CORE_COURSE_GRADE_2_F       0
CORE_COURSE_NAME_3_F        0
CORE_COURSE_GRADE_3_F       0
CORE_COURSE_NAME_1_S        0
CORE_COURSE_GRADE_1_S       0
CORE_COURSE_NAME_2_S        0
CORE_COURSE_GRADE_2_S       0
HOUSING_STS                 0
RETURNED_2ND_YR             0
DISTANCE_FROM_HOME          0
HIGH_SCHL_GPA               0
HIGH_SCHL_NAME              0
FATHER_HI_EDU_CD            0
MOTHER_HI_EDU_CD            0
DEGREE_GROUP_CD             0
FIRST_TERM_ATTEMPT_HRS      0
FIRST_TERM_EARNED_HRS       0
SECOND_TERM_ATTEMPT_HRS     0
SECOND_TERM_EARNED_HRS      0
GROSS_FIN_NEED              0
COST_OF_ATTEND              0
EST_FAM_CONTRIBUTION        0
UNMET_NEED                  0
dtype: int64
In [31]:
# here it is 0 means attriting and 1 means not attriting
# It gives count for number of students left and return
df.RETURNED_2ND_YR.value_counts()
Out[31]:
1    2677
0     723
Name: RETURNED_2ND_YR, dtype: int64
In [32]:
# for model building purpose we are replacing 1 as attriting and 0 as not attriting
df["RETURNED_2ND_YR"].replace([0,1],[1,0], inplace=True)
In [33]:
df.RETURNED_2ND_YR.value_counts()
Out[33]:
0    2677
1     723
Name: RETURNED_2ND_YR, dtype: int64
In [34]:
df.describe()
Out[34]:
STDNT_AGE STDNT_BACKGROUND STDNT_TEST_ENTRANCE_COMB RETURNED_2ND_YR DISTANCE_FROM_HOME HIGH_SCHL_GPA HIGH_SCHL_NAME FATHER_HI_EDU_CD MOTHER_HI_EDU_CD FIRST_TERM_ATTEMPT_HRS FIRST_TERM_EARNED_HRS SECOND_TERM_ATTEMPT_HRS SECOND_TERM_EARNED_HRS GROSS_FIN_NEED COST_OF_ATTEND EST_FAM_CONTRIBUTION UNMET_NEED
count 3400.000000 3400.000000 3400.000000 3400.000000 3400.000000 3400.000000 3400.000000 3400.000000 3400.000000 3400.000000 3400.000000 3400.000000 3400.000000 3.400000e+03 3.400000e+03 3.400000e+03 3400.000000
mean 17.994118 1.902353 997.758501 0.212647 99.825185 3.202738 102.563824 2.489412 2.632941 13.985882 12.205000 14.285848 12.511125 3.034025e+05 5.515335e+05 3.446899e+05 0.633235
std 0.552052 1.304436 143.868831 0.409240 234.330439 0.451773 122.940017 0.663338 0.585675 1.554286 2.986294 2.039621 3.362101 4.836255e+05 6.064403e+05 7.871878e+05 0.748765
min 16.000000 1.000000 530.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 9.000000 0.000000 2.000000 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000
25% 18.000000 1.000000 910.000000 0.000000 69.000000 2.870000 11.000000 2.000000 2.000000 13.000000 11.000000 13.000000 11.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000
50% 18.000000 1.000000 997.758501 0.000000 69.000000 3.202738 48.000000 2.000000 3.000000 14.000000 13.000000 14.285848 13.000000 0.000000e+00 1.505400e+05 0.000000e+00 0.000000
75% 18.000000 3.000000 1070.000000 0.000000 138.000000 3.550000 139.000000 3.000000 3.000000 15.000000 15.000000 16.000000 15.000000 5.995800e+05 1.192815e+06 3.241800e+05 1.000000
max 26.000000 8.000000 1510.000000 1.000000 5932.000000 4.000000 533.000000 4.000000 4.000000 21.000000 21.000000 23.000000 23.000000 2.124900e+06 2.124900e+06 5.999940e+06 2.000000
In [35]:
# Multiple violin plot
import plotly.express as px
fx = px.data.tips()
fig = px.violin(df, y='STDNT_AGE', x='RETURNED_2ND_YR', color='STDNT_GENDER', box=True, points='all', hover_data=df.columns)
fig.show()
In [36]:
df2=df.copy()
In [37]:
# Its new data frame to see all the std who left
df_new=df.copy()
df_new=df_new.query("RETURNED_2ND_YR=='1'")
In [38]:
#df_new only selected left students
px.parallel_coordinates(df_new,dimensions=["EST_FAM_CONTRIBUTION","GROSS_FIN_NEED","UNMET_NEED","COST_OF_ATTEND","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=["red","green","blue"])
In [39]:
px.parallel_categories(df_new,dimensions=["SECOND_TERM_EARNED_HRS","FIRST_TERM_EARNED_HRS","FIRST_TERM_ATTEMPT_HRS","SECOND_TERM_ATTEMPT_HRS","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [40]:
px.parallel_categories(df_new,dimensions=["FATHER_HI_EDU_CD","MOTHER_HI_EDU_CD","DEGREE_GROUP_CD","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [41]:
px.parallel_coordinates(df_new,dimensions=["HIGH_SCHL_GPA","HIGH_SCHL_NAME","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=["red","green","blue"])
In [42]:
px.parallel_categories(df,dimensions=["IN_STATE_FLAG","INTERNATIONAL_STS","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [43]:
px.parallel_categories(df_new,dimensions=["STDNT_AGE","STDNT_GENDER","STDNT_BACKGROUND","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [44]:
px.parallel_categories(df_new,dimensions=["STDNT_MAJOR","STDNT_MINOR","STDNT_TEST_ENTRANCE_COMB","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [45]:
px.parallel_categories(df_new,dimensions=["CORE_COURSE_NAME_1_F","CORE_COURSE_NAME_2_F","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [46]:
px.parallel_categories(df_new,dimensions=["CORE_COURSE_GRADE_1_F","CORE_COURSE_GRADE_2_F","CORE_COURSE_GRADE_3_F","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [47]:
px.parallel_categories(df_new,dimensions=["CORE_COURSE_NAME_3_F","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [48]:
px.parallel_categories(df_new,dimensions=["CORE_COURSE_NAME_1_S","CORE_COURSE_NAME_2_S","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [49]:
px.parallel_categories(df_new,dimensions=["CORE_COURSE_GRADE_1_S","CORE_COURSE_GRADE_2_S","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [50]:
px.parallel_categories(df_new,dimensions=["DISTANCE_FROM_HOME","HOUSING_STS","RETURNED_2ND_YR"],color="RETURNED_2ND_YR",color_continuous_scale=px.colors.sequential.Inferno)
In [51]:
# Outliers Detection by ploting Box plot
In [52]:
import plotly.express as px
fx = px.data.tips()
fig = px.violin(df, y='STDNT_AGE',box=True)
fig.show()
In [53]:
import plotly.express as px
fx = px.data.tips()
fig = px.violin(df, y='STDNT_BACKGROUND',box=True)
fig.show()
In [54]:
import plotly.express as px
fx = px.data.tips()
fig = px.violin(df, y='STDNT_TEST_ENTRANCE_COMB',box=True)
fig.show()
In [55]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='DISTANCE_FROM_HOME')
fig.show()
In [56]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='HIGH_SCHL_GPA')
fig.show()
In [57]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='HIGH_SCHL_NAME')
fig.show()
In [58]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='FATHER_HI_EDU_CD')
fig.show()
In [59]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='MOTHER_HI_EDU_CD')
fig.show()
In [60]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='SECOND_TERM_EARNED_HRS')
fig.show()
In [61]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='FIRST_TERM_EARNED_HRS')
fig.show()
In [62]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='GROSS_FIN_NEED')
fig.show()
In [63]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='COST_OF_ATTEND')
fig.show()
In [64]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='EST_FAM_CONTRIBUTION')
fig.show()
In [65]:
import plotly.express as px
fx = px.data.tips()
fig = px.box(df, y='UNMET_NEED')
fig.show()
In [66]:
#lets check how many levels each variable has
for col in df.columns:
    print(col,':',len(df[col].unique()),'labels')
STDNT_AGE : 11 labels
STDNT_GENDER : 2 labels
STDNT_BACKGROUND : 8 labels
IN_STATE_FLAG : 2 labels
INTERNATIONAL_STS : 2 labels
STDNT_MAJOR : 54 labels
STDNT_MINOR : 37 labels
STDNT_TEST_ENTRANCE_COMB : 25 labels
CORE_COURSE_NAME_1_F : 69 labels
CORE_COURSE_GRADE_1_F : 6 labels
CORE_COURSE_NAME_2_F : 74 labels
CORE_COURSE_GRADE_2_F : 6 labels
CORE_COURSE_NAME_3_F : 74 labels
CORE_COURSE_GRADE_3_F : 6 labels
CORE_COURSE_NAME_1_S : 66 labels
CORE_COURSE_GRADE_1_S : 7 labels
CORE_COURSE_NAME_2_S : 69 labels
CORE_COURSE_GRADE_2_S : 6 labels
HOUSING_STS : 2 labels
RETURNED_2ND_YR : 2 labels
DISTANCE_FROM_HOME : 72 labels
HIGH_SCHL_GPA : 201 labels
HIGH_SCHL_NAME : 439 labels
FATHER_HI_EDU_CD : 4 labels
MOTHER_HI_EDU_CD : 4 labels
DEGREE_GROUP_CD : 3 labels
FIRST_TERM_ATTEMPT_HRS : 12 labels
FIRST_TERM_EARNED_HRS : 22 labels
SECOND_TERM_ATTEMPT_HRS : 22 labels
SECOND_TERM_EARNED_HRS : 24 labels
GROSS_FIN_NEED : 927 labels
COST_OF_ATTEND : 120 labels
EST_FAM_CONTRIBUTION : 1236 labels
UNMET_NEED : 3 labels
In [67]:
# Empty list to store columns with categorical data
categorical = []
for col, value in df.iteritems():
    if value.dtype == 'object':
        categorical.append(col)
In [68]:
categorical
Out[68]:
['STDNT_GENDER',
 'IN_STATE_FLAG',
 'INTERNATIONAL_STS',
 'STDNT_MAJOR',
 'STDNT_MINOR',
 'CORE_COURSE_NAME_1_F',
 'CORE_COURSE_GRADE_1_F',
 'CORE_COURSE_NAME_2_F',
 'CORE_COURSE_GRADE_2_F',
 'CORE_COURSE_NAME_3_F',
 'CORE_COURSE_GRADE_3_F',
 'CORE_COURSE_NAME_1_S',
 'CORE_COURSE_GRADE_1_S',
 'CORE_COURSE_NAME_2_S',
 'CORE_COURSE_GRADE_2_S',
 'HOUSING_STS',
 'DEGREE_GROUP_CD']
In [69]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import matplotlib.pyplot as plt
%matplotlib inline
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import time
from subprocess import check_output

#correlation map
f,ax = plt.subplots(figsize=(14, 14))
sns.heatmap(df.corr(), annot=True, fmt= '.1f',ax=ax)
Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0xda74470>
In [70]:
# 50% or greater than 50%
#"COST_OF_ATTEND" is highly correlated with "UNMET_NEED","GROSS_FIN_NEED"  we will remove "COST_OF_ATTEND" 
#"SECOND_TERM_ATTEMPT_HRS" is highly correlated with "SECOND_TERM_EARNED_HRS"  we will keep only one "SECOND_TERM_EARNED_HRS"
#"FIRST_TERM_ATTEMPT_HRS" is highly correlated with "FIRST_TERM_EARNED_HRS" we will keep only one "FIRST_TERM_EARNED_HRS"
In [71]:
df2=df.copy()
In [72]:
#To address this you could plot a correlation matrix and then write down which features are correlated are remove them by hand,
#but there is a smarter way to check the
#correlation coefficient above 0.6. When doing so we should always remove the target variable, for obvious reasons. 
In [73]:
correlated_features = set()
correlation_matrix = df2.drop('RETURNED_2ND_YR', axis=1).corr()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.6:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
In [74]:
correlated_features
Out[74]:
{'COST_OF_ATTEND', 'SECOND_TERM_EARNED_HRS', 'UNMET_NEED'}
In [75]:
# 'HIGH_SCHL_NAME' is not required and 'SECOND_TERM_ATTEMPT_HRS','FIRST_TERM_ATTEMPT_HRS','GROSS_FIN_NEED','UNMET_NEED'
#drpped because of high correalation
df=df.drop(['HIGH_SCHL_NAME','SECOND_TERM_ATTEMPT_HRS','FIRST_TERM_ATTEMPT_HRS','GROSS_FIN_NEED','UNMET_NEED'],axis=1)
In [76]:
df3=df.copy()
In [77]:
x=df3.drop(["RETURNED_2ND_YR"],axis=1)
y=df3['RETURNED_2ND_YR']
x=pd.get_dummies(x)
In [78]:
print(x.shape)
print(y.shape)
(3400, 496)
(3400,)
In [79]:
x.columns
Out[79]:
Index(['STDNT_AGE', 'STDNT_BACKGROUND', 'STDNT_TEST_ENTRANCE_COMB',
       'DISTANCE_FROM_HOME', 'HIGH_SCHL_GPA', 'FATHER_HI_EDU_CD',
       'MOTHER_HI_EDU_CD', 'FIRST_TERM_EARNED_HRS', 'SECOND_TERM_EARNED_HRS',
       'COST_OF_ATTEND',
       ...
       'CORE_COURSE_GRADE_2_S_B', 'CORE_COURSE_GRADE_2_S_C',
       'CORE_COURSE_GRADE_2_S_D', 'CORE_COURSE_GRADE_2_S_F',
       'CORE_COURSE_GRADE_2_S_INCOMPL', 'HOUSING_STS_Off Campus',
       'HOUSING_STS_On Campus', 'DEGREE_GROUP_CD_A', 'DEGREE_GROUP_CD_B',
       'DEGREE_GROUP_CD_V'],
      dtype='object', length=496)

1.FILTERING METHOD

Pearson Correalation

In [80]:
def cor_selector(x, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in x.columns.tolist():
        cor = np.corrcoef(x[i], y)[0, 1]
        cor_list.append(cor)
   
    # feature name
    cor_feature = x.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in x.columns.tolist()]
    return cor_support, cor_feature
In [81]:
cor_support, cor_feature = cor_selector(x, y)
print(str(len(cor_feature)), 'selected features')
100 selected features
In [82]:
cor_feature
Out[82]:
['STDNT_MAJOR_Music',
 'CORE_COURSE_NAME_2_F_ENGL 1102',
 'STDNT_MAJOR_Theatre Education',
 'STDNT_MAJOR_Finance',
 'CORE_COURSE_NAME_1_F_GEOL 1121',
 'STDNT_MAJOR_Health Science',
 'CORE_COURSE_NAME_2_S_SPAN 1001',
 'CORE_COURSE_GRADE_1_F_A',
 'CORE_COURSE_NAME_2_F_CHEM 1151L',
 'CORE_COURSE_NAME_1_S_HIST 2112',
 'CORE_COURSE_NAME_1_F_COMM 1110',
 'CORE_COURSE_NAME_1_F_ANTH 1105I',
 'CORE_COURSE_NAME_1_S_MUSC 1100',
 'DISTANCE_FROM_HOME',
 'STDNT_MAJOR_Undeclared',
 'STDNT_MAJOR_General Business',
 'CORE_COURSE_NAME_2_S_MATH 1131',
 'CORE_COURSE_NAME_2_F_ASTR 1105',
 'FATHER_HI_EDU_CD',
 'CORE_COURSE_NAME_1_S_SPAN 1001',
 'CORE_COURSE_NAME_3_F_COMM 1110',
 'STDNT_GENDER_F',
 'STDNT_GENDER_M',
 'CORE_COURSE_NAME_3_F_BIOL 1215K',
 'CORE_COURSE_NAME_3_F_THEA 1100',
 'STDNT_MAJOR_Pre-Nursing',
 'CORE_COURSE_NAME_1_S_HIST 1111',
 'CORE_COURSE_NAME_1_S_HIST 1112',
 'CORE_COURSE_NAME_3_F_SPAN 1001',
 'CORE_COURSE_NAME_2_S_ECON 2106',
 'CORE_COURSE_NAME_2_S_ITDS 2735',
 'CORE_COURSE_NAME_1_S_MATH 1111',
 'CORE_COURSE_NAME_1_S_COMM 1110',
 'CORE_COURSE_NAME_3_F_ECON 2106',
 'CORE_COURSE_NAME_3_F_ENGL 2136',
 'CORE_COURSE_NAME_1_F_ENGL 2111',
 'CORE_COURSE_NAME_2_F_PHYS 2311',
 'CORE_COURSE_NAME_3_F_PHYS 2211',
 'CORE_COURSE_NAME_2_F_MATH 2125',
 "STDNT_MINOR_Women's Studies",
 'CORE_COURSE_NAME_2_S_MATH 2125',
 'CORE_COURSE_NAME_3_F_LATN 1002',
 'CORE_COURSE_NAME_3_F_THEA 1100I',
 'CORE_COURSE_NAME_2_S_COMM 1110',
 'STDNT_MAJOR_Psychology',
 'CORE_COURSE_NAME_1_F_ECON 2106',
 'STDNT_MAJOR_Criminal Justice',
 'STDNT_MAJOR_Biology and Secondary Ed',
 'CORE_COURSE_NAME_1_S_CPSC 1301L',
 'STDNT_MAJOR_Computer Science - Games',
 'CORE_COURSE_NAME_2_F_MATH 1113',
 'CORE_COURSE_NAME_2_S_POLS 1101',
 'STDNT_MAJOR_Music Performance',
 'CORE_COURSE_NAME_2_F_POLS 1101',
 'STDNT_BACKGROUND',
 'IN_STATE_FLAG_Y',
 'IN_STATE_FLAG_N',
 'CORE_COURSE_NAME_2_S_CPSC 1105',
 'CORE_COURSE_NAME_1_S_BIOL 1215K',
 'CORE_COURSE_NAME_3_F_ITDS 2735',
 'CORE_COURSE_NAME_2_S_MATH 1125',
 'CORE_COURSE_NAME_3_F_ENGL 1102',
 'CORE_COURSE_NAME_2_F_ENVS 1105I',
 'CORE_COURSE_NAME_3_F_ANTH 1105',
 'HOUSING_STS_On Campus',
 'HOUSING_STS_Off Campus',
 'INTERNATIONAL_STS_N',
 'INTERNATIONAL_STS_Y',
 'CORE_COURSE_NAME_3_F_ENGL 1101I',
 'STDNT_MAJOR_Joint Enrollment - Accel',
 'CORE_COURSE_GRADE_2_S_B',
 'CORE_COURSE_GRADE_2_F_D',
 'STDNT_MAJOR_Music Education',
 'CORE_COURSE_GRADE_2_S_C',
 'STDNT_MINOR_English Language/Literature',
 'CORE_COURSE_GRADE_2_S_D',
 'STDNT_MAJOR_Theatre Arts',
 'CORE_COURSE_GRADE_1_S_A',
 'CORE_COURSE_GRADE_3_F_D',
 'CORE_COURSE_GRADE_1_F_D',
 'FIRST_TERM_EARNED_HRS',
 'CORE_COURSE_GRADE_3_F_C',
 'CORE_COURSE_GRADE_1_S_Unknown',
 'CORE_COURSE_GRADE_2_F_B',
 'CORE_COURSE_GRADE_2_F_C',
 'CORE_COURSE_GRADE_2_F_A',
 'HIGH_SCHL_GPA',
 'CORE_COURSE_NAME_1_S_ENGL 1102',
 'CORE_COURSE_GRADE_3_F_A',
 'CORE_COURSE_GRADE_1_S_F',
 'CORE_COURSE_GRADE_3_F_F',
 'CORE_COURSE_GRADE_2_S_A',
 'CORE_COURSE_GRADE_1_F_B',
 'CORE_COURSE_GRADE_1_S_C',
 'CORE_COURSE_GRADE_2_F_F',
 'SECOND_TERM_EARNED_HRS',
 'CORE_COURSE_GRADE_2_S_F',
 'CORE_COURSE_GRADE_1_F_F',
 'CORE_COURSE_NAME_2_S_ENGL 1102',
 'CORE_COURSE_GRADE_1_S_B']
In [83]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
x_norm = MinMaxScaler().fit_transform(x)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(x_norm, y)
Out[83]:
SelectKBest(k=100, score_func=<function chi2 at 0x000000000DEF71E0>)
In [84]:
chi_support = chi_selector.get_support()
chi_feature = x.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')
100 selected features
In [85]:
#Wrapper Method
In [86]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=1, verbose=5)
rfe_selector.fit(x_norm, y)
Fitting estimator with 496 features.
Fitting estimator with 495 features.
Fitting estimator with 494 features.
Fitting estimator with 493 features.
Fitting estimator with 492 features.
Fitting estimator with 491 features.
Fitting estimator with 490 features.
Fitting estimator with 489 features.
Fitting estimator with 488 features.
Fitting estimator with 487 features.
Fitting estimator with 486 features.
Fitting estimator with 485 features.
Fitting estimator with 484 features.
Fitting estimator with 483 features.
Fitting estimator with 482 features.
Fitting estimator with 481 features.
Fitting estimator with 480 features.
Fitting estimator with 479 features.
Fitting estimator with 478 features.
Fitting estimator with 477 features.
Fitting estimator with 476 features.
Fitting estimator with 475 features.
Fitting estimator with 474 features.
Fitting estimator with 473 features.
Fitting estimator with 472 features.
Fitting estimator with 471 features.
Fitting estimator with 470 features.
Fitting estimator with 469 features.
Fitting estimator with 468 features.
Fitting estimator with 467 features.
Fitting estimator with 466 features.
Fitting estimator with 465 features.
Fitting estimator with 464 features.
Fitting estimator with 463 features.
Fitting estimator with 462 features.
Fitting estimator with 461 features.
Fitting estimator with 460 features.
Fitting estimator with 459 features.
Fitting estimator with 458 features.
Fitting estimator with 457 features.
Fitting estimator with 456 features.
Fitting estimator with 455 features.
Fitting estimator with 454 features.
Fitting estimator with 453 features.
Fitting estimator with 452 features.
Fitting estimator with 451 features.
Fitting estimator with 450 features.
Fitting estimator with 449 features.
Fitting estimator with 448 features.
Fitting estimator with 447 features.
Fitting estimator with 446 features.
Fitting estimator with 445 features.
Fitting estimator with 444 features.
Fitting estimator with 443 features.
Fitting estimator with 442 features.
Fitting estimator with 441 features.
Fitting estimator with 440 features.
Fitting estimator with 439 features.
Fitting estimator with 438 features.
Fitting estimator with 437 features.
Fitting estimator with 436 features.
Fitting estimator with 435 features.
Fitting estimator with 434 features.
Fitting estimator with 433 features.
Fitting estimator with 432 features.
Fitting estimator with 431 features.
Fitting estimator with 430 features.
Fitting estimator with 429 features.
Fitting estimator with 428 features.
Fitting estimator with 427 features.
Fitting estimator with 426 features.
Fitting estimator with 425 features.
Fitting estimator with 424 features.
Fitting estimator with 423 features.
Fitting estimator with 422 features.
Fitting estimator with 421 features.
Fitting estimator with 420 features.
Fitting estimator with 419 features.
Fitting estimator with 418 features.
Fitting estimator with 417 features.
Fitting estimator with 416 features.
Fitting estimator with 415 features.
Fitting estimator with 414 features.
Fitting estimator with 413 features.
Fitting estimator with 412 features.
Fitting estimator with 411 features.
Fitting estimator with 410 features.
Fitting estimator with 409 features.
Fitting estimator with 408 features.
Fitting estimator with 407 features.
Fitting estimator with 406 features.
Fitting estimator with 405 features.
Fitting estimator with 404 features.
Fitting estimator with 403 features.
Fitting estimator with 402 features.
Fitting estimator with 401 features.
Fitting estimator with 400 features.
Fitting estimator with 399 features.
Fitting estimator with 398 features.
Fitting estimator with 397 features.
Fitting estimator with 396 features.
Fitting estimator with 395 features.
Fitting estimator with 394 features.
Fitting estimator with 393 features.
Fitting estimator with 392 features.
Fitting estimator with 391 features.
Fitting estimator with 390 features.
Fitting estimator with 389 features.
Fitting estimator with 388 features.
Fitting estimator with 387 features.
Fitting estimator with 386 features.
Fitting estimator with 385 features.
Fitting estimator with 384 features.
Fitting estimator with 383 features.
Fitting estimator with 382 features.
Fitting estimator with 381 features.
Fitting estimator with 380 features.
Fitting estimator with 379 features.
Fitting estimator with 378 features.
Fitting estimator with 377 features.
Fitting estimator with 376 features.
Fitting estimator with 375 features.
Fitting estimator with 374 features.
Fitting estimator with 373 features.
Fitting estimator with 372 features.
Fitting estimator with 371 features.
Fitting estimator with 370 features.
Fitting estimator with 369 features.
Fitting estimator with 368 features.
Fitting estimator with 367 features.
Fitting estimator with 366 features.
Fitting estimator with 365 features.
Fitting estimator with 364 features.
Fitting estimator with 363 features.
Fitting estimator with 362 features.
Fitting estimator with 361 features.
Fitting estimator with 360 features.
Fitting estimator with 359 features.
Fitting estimator with 358 features.
Fitting estimator with 357 features.
Fitting estimator with 356 features.
Fitting estimator with 355 features.
Fitting estimator with 354 features.
Fitting estimator with 353 features.
Fitting estimator with 352 features.
Fitting estimator with 351 features.
Fitting estimator with 350 features.
Fitting estimator with 349 features.
Fitting estimator with 348 features.
Fitting estimator with 347 features.
Fitting estimator with 346 features.
Fitting estimator with 345 features.
Fitting estimator with 344 features.
Fitting estimator with 343 features.
Fitting estimator with 342 features.
Fitting estimator with 341 features.
Fitting estimator with 340 features.
Fitting estimator with 339 features.
Fitting estimator with 338 features.
Fitting estimator with 337 features.
Fitting estimator with 336 features.
Fitting estimator with 335 features.
Fitting estimator with 334 features.
Fitting estimator with 333 features.
Fitting estimator with 332 features.
Fitting estimator with 331 features.
Fitting estimator with 330 features.
Fitting estimator with 329 features.
Fitting estimator with 328 features.
Fitting estimator with 327 features.
Fitting estimator with 326 features.
Fitting estimator with 325 features.
Fitting estimator with 324 features.
Fitting estimator with 323 features.
Fitting estimator with 322 features.
Fitting estimator with 321 features.
Fitting estimator with 320 features.
Fitting estimator with 319 features.
Fitting estimator with 318 features.
Fitting estimator with 317 features.
Fitting estimator with 316 features.
Fitting estimator with 315 features.
Fitting estimator with 314 features.
Fitting estimator with 313 features.
Fitting estimator with 312 features.
Fitting estimator with 311 features.
Fitting estimator with 310 features.
Fitting estimator with 309 features.
Fitting estimator with 308 features.
Fitting estimator with 307 features.
Fitting estimator with 306 features.
Fitting estimator with 305 features.
Fitting estimator with 304 features.
Fitting estimator with 303 features.
Fitting estimator with 302 features.
Fitting estimator with 301 features.
Fitting estimator with 300 features.
Fitting estimator with 299 features.
Fitting estimator with 298 features.
Fitting estimator with 297 features.
Fitting estimator with 296 features.
Fitting estimator with 295 features.
Fitting estimator with 294 features.
Fitting estimator with 293 features.
Fitting estimator with 292 features.
Fitting estimator with 291 features.
Fitting estimator with 290 features.
Fitting estimator with 289 features.
Fitting estimator with 288 features.
Fitting estimator with 287 features.
Fitting estimator with 286 features.
Fitting estimator with 285 features.
Fitting estimator with 284 features.
Fitting estimator with 283 features.
Fitting estimator with 282 features.
Fitting estimator with 281 features.
Fitting estimator with 280 features.
Fitting estimator with 279 features.
Fitting estimator with 278 features.
Fitting estimator with 277 features.
Fitting estimator with 276 features.
Fitting estimator with 275 features.
Fitting estimator with 274 features.
Fitting estimator with 273 features.
Fitting estimator with 272 features.
Fitting estimator with 271 features.
Fitting estimator with 270 features.
Fitting estimator with 269 features.
Fitting estimator with 268 features.
Fitting estimator with 267 features.
Fitting estimator with 266 features.
Fitting estimator with 265 features.
Fitting estimator with 264 features.
Fitting estimator with 263 features.
Fitting estimator with 262 features.
Fitting estimator with 261 features.
Fitting estimator with 260 features.
Fitting estimator with 259 features.
Fitting estimator with 258 features.
Fitting estimator with 257 features.
Fitting estimator with 256 features.
Fitting estimator with 255 features.
Fitting estimator with 254 features.
Fitting estimator with 253 features.
Fitting estimator with 252 features.
Fitting estimator with 251 features.
Fitting estimator with 250 features.
Fitting estimator with 249 features.
Fitting estimator with 248 features.
Fitting estimator with 247 features.
Fitting estimator with 246 features.
Fitting estimator with 245 features.
Fitting estimator with 244 features.
Fitting estimator with 243 features.
Fitting estimator with 242 features.
Fitting estimator with 241 features.
Fitting estimator with 240 features.
Fitting estimator with 239 features.
Fitting estimator with 238 features.
Fitting estimator with 237 features.
Fitting estimator with 236 features.
Fitting estimator with 235 features.
Fitting estimator with 234 features.
Fitting estimator with 233 features.
Fitting estimator with 232 features.
Fitting estimator with 231 features.
Fitting estimator with 230 features.
Fitting estimator with 229 features.
Fitting estimator with 228 features.
Fitting estimator with 227 features.
Fitting estimator with 226 features.
Fitting estimator with 225 features.
Fitting estimator with 224 features.
Fitting estimator with 223 features.
Fitting estimator with 222 features.
Fitting estimator with 221 features.
Fitting estimator with 220 features.
Fitting estimator with 219 features.
Fitting estimator with 218 features.
Fitting estimator with 217 features.
Fitting estimator with 216 features.
Fitting estimator with 215 features.
Fitting estimator with 214 features.
Fitting estimator with 213 features.
Fitting estimator with 212 features.
Fitting estimator with 211 features.
Fitting estimator with 210 features.
Fitting estimator with 209 features.
Fitting estimator with 208 features.
Fitting estimator with 207 features.
Fitting estimator with 206 features.
Fitting estimator with 205 features.
Fitting estimator with 204 features.
Fitting estimator with 203 features.
Fitting estimator with 202 features.
Fitting estimator with 201 features.
Fitting estimator with 200 features.
Fitting estimator with 199 features.
Fitting estimator with 198 features.
Fitting estimator with 197 features.
Fitting estimator with 196 features.
Fitting estimator with 195 features.
Fitting estimator with 194 features.
Fitting estimator with 193 features.
Fitting estimator with 192 features.
Fitting estimator with 191 features.
Fitting estimator with 190 features.
Fitting estimator with 189 features.
Fitting estimator with 188 features.
Fitting estimator with 187 features.
Fitting estimator with 186 features.
Fitting estimator with 185 features.
Fitting estimator with 184 features.
Fitting estimator with 183 features.
Fitting estimator with 182 features.
Fitting estimator with 181 features.
Fitting estimator with 180 features.
Fitting estimator with 179 features.
Fitting estimator with 178 features.
Fitting estimator with 177 features.
Fitting estimator with 176 features.
Fitting estimator with 175 features.
Fitting estimator with 174 features.
Fitting estimator with 173 features.
Fitting estimator with 172 features.
Fitting estimator with 171 features.
Fitting estimator with 170 features.
Fitting estimator with 169 features.
Fitting estimator with 168 features.
Fitting estimator with 167 features.
Fitting estimator with 166 features.
Fitting estimator with 165 features.
Fitting estimator with 164 features.
Fitting estimator with 163 features.
Fitting estimator with 162 features.
Fitting estimator with 161 features.
Fitting estimator with 160 features.
Fitting estimator with 159 features.
Fitting estimator with 158 features.
Fitting estimator with 157 features.
Fitting estimator with 156 features.
Fitting estimator with 155 features.
Fitting estimator with 154 features.
Fitting estimator with 153 features.
Fitting estimator with 152 features.
Fitting estimator with 151 features.
Fitting estimator with 150 features.
Fitting estimator with 149 features.
Fitting estimator with 148 features.
Fitting estimator with 147 features.
Fitting estimator with 146 features.
Fitting estimator with 145 features.
Fitting estimator with 144 features.
Fitting estimator with 143 features.
Fitting estimator with 142 features.
Fitting estimator with 141 features.
Fitting estimator with 140 features.
Fitting estimator with 139 features.
Fitting estimator with 138 features.
Fitting estimator with 137 features.
Fitting estimator with 136 features.
Fitting estimator with 135 features.
Fitting estimator with 134 features.
Fitting estimator with 133 features.
Fitting estimator with 132 features.
Fitting estimator with 131 features.
Fitting estimator with 130 features.
Fitting estimator with 129 features.
Fitting estimator with 128 features.
Fitting estimator with 127 features.
Fitting estimator with 126 features.
Fitting estimator with 125 features.
Fitting estimator with 124 features.
Fitting estimator with 123 features.
Fitting estimator with 122 features.
Fitting estimator with 121 features.
Fitting estimator with 120 features.
Fitting estimator with 119 features.
Fitting estimator with 118 features.
Fitting estimator with 117 features.
Fitting estimator with 116 features.
Fitting estimator with 115 features.
Fitting estimator with 114 features.
Fitting estimator with 113 features.
Fitting estimator with 112 features.
Fitting estimator with 111 features.
Fitting estimator with 110 features.
Fitting estimator with 109 features.
Fitting estimator with 108 features.
Fitting estimator with 107 features.
Fitting estimator with 106 features.
Fitting estimator with 105 features.
Fitting estimator with 104 features.
Fitting estimator with 103 features.
Fitting estimator with 102 features.
Fitting estimator with 101 features.
Out[86]:
RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=100, step=1, verbose=5)
In [87]:
rfe_support = rfe_selector.get_support()
rfe_feature = x.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')
100 selected features
In [88]:
rfe_feature
Out[88]:
['STDNT_BACKGROUND',
 'STDNT_TEST_ENTRANCE_COMB',
 'DISTANCE_FROM_HOME',
 'FIRST_TERM_EARNED_HRS',
 'INTERNATIONAL_STS_N',
 'STDNT_MAJOR_Biology and Secondary Ed',
 'STDNT_MAJOR_Early Admission - Accel',
 'STDNT_MAJOR_Health Science',
 'STDNT_MAJOR_Joint Enrollment - Accel',
 'STDNT_MAJOR_Music Education',
 'STDNT_MAJOR_Music Performance',
 'STDNT_MAJOR_Spanish',
 'STDNT_MAJOR_Theatre Arts',
 'STDNT_MAJOR_Theatre Education',
 'STDNT_MINOR_African Studies',
 'STDNT_MINOR_Communication',
 'STDNT_MINOR_English Language/Literature',
 'STDNT_MINOR_Mathematics',
 'STDNT_MINOR_Professional Writing',
 "STDNT_MINOR_Women's Studies",
 'CORE_COURSE_NAME_1_F_ANTH 1105I',
 'CORE_COURSE_NAME_1_F_ARTH 1100I',
 'CORE_COURSE_NAME_1_F_ASTR 1105',
 'CORE_COURSE_NAME_1_F_ECON 2106',
 'CORE_COURSE_NAME_1_F_ENGL 2111',
 'CORE_COURSE_NAME_1_F_GEOL 1121',
 'CORE_COURSE_NAME_1_F_ITDS 2748',
 'CORE_COURSE_NAME_1_F_MATH 1132',
 'CORE_COURSE_NAME_1_F_POLS 2401',
 'CORE_COURSE_NAME_1_F_SPAN 1001',
 'CORE_COURSE_GRADE_1_F_B',
 'CORE_COURSE_GRADE_1_F_F',
 'CORE_COURSE_NAME_2_F_ASTR 1105',
 'CORE_COURSE_NAME_2_F_COMM 1110I',
 'CORE_COURSE_NAME_2_F_ENGL 1101I',
 'CORE_COURSE_NAME_2_F_ENVS 1105I',
 'CORE_COURSE_NAME_2_F_FREN 1001',
 'CORE_COURSE_NAME_2_F_GEOL 1110',
 'CORE_COURSE_NAME_2_F_GEOL 1121',
 'CORE_COURSE_NAME_2_F_HIST 1112I',
 'CORE_COURSE_NAME_2_F_ITDS 1155',
 'CORE_COURSE_NAME_2_F_MATH 1113',
 'CORE_COURSE_NAME_2_F_MATH 2125',
 'CORE_COURSE_NAME_2_F_PHYS 2311',
 'CORE_COURSE_NAME_2_F_POLS 1101',
 'CORE_COURSE_GRADE_2_F_F',
 'CORE_COURSE_NAME_3_F_ANTH 1105',
 'CORE_COURSE_NAME_3_F_BIOL 1215K',
 'CORE_COURSE_NAME_3_F_CHEM 1211',
 'CORE_COURSE_NAME_3_F_COMM 1110',
 'CORE_COURSE_NAME_3_F_COMM 1110H',
 'CORE_COURSE_NAME_3_F_CPSC 1105',
 'CORE_COURSE_NAME_3_F_CPSC 1301',
 'CORE_COURSE_NAME_3_F_ECON 2105',
 'CORE_COURSE_NAME_3_F_ECON 2106',
 'CORE_COURSE_NAME_3_F_ENGL 1101I',
 'CORE_COURSE_NAME_3_F_ENGL 1102',
 'CORE_COURSE_NAME_3_F_ENGL 2136',
 'CORE_COURSE_NAME_3_F_FREN 1001',
 'CORE_COURSE_NAME_3_F_GEOL 1121L',
 'CORE_COURSE_NAME_3_F_ITDS 1145',
 'CORE_COURSE_NAME_3_F_ITDS 1155',
 'CORE_COURSE_NAME_3_F_ITDS 2735',
 'CORE_COURSE_NAME_3_F_ITDS 2746',
 'CORE_COURSE_NAME_3_F_JAPN 1001',
 'CORE_COURSE_NAME_3_F_LATN 1002',
 'CORE_COURSE_NAME_3_F_POLS 1101H',
 'CORE_COURSE_NAME_3_F_SPAN 1001',
 'CORE_COURSE_NAME_3_F_SPAN 1002',
 'CORE_COURSE_NAME_3_F_THEA 1100I',
 'CORE_COURSE_GRADE_3_F_F',
 'CORE_COURSE_NAME_1_S_BIOL 1225K',
 'CORE_COURSE_NAME_1_S_CHEM 1211',
 'CORE_COURSE_NAME_1_S_CPSC 1301L',
 'CORE_COURSE_NAME_1_S_ENGL 1102',
 'CORE_COURSE_NAME_1_S_ENVS 1105',
 'CORE_COURSE_NAME_1_S_FREN 1002',
 'CORE_COURSE_NAME_1_S_FREN 2001',
 'CORE_COURSE_NAME_1_S_HIST 1111',
 'CORE_COURSE_NAME_1_S_HIST 1112',
 'CORE_COURSE_NAME_1_S_MATH 1113',
 'CORE_COURSE_NAME_1_S_SPAN 1001',
 'CORE_COURSE_GRADE_1_S_A',
 'CORE_COURSE_GRADE_1_S_B',
 'CORE_COURSE_GRADE_1_S_Unknown',
 'CORE_COURSE_NAME_2_S_CHEM 1211',
 'CORE_COURSE_NAME_2_S_CPSC 1105',
 'CORE_COURSE_NAME_2_S_ECON 2106',
 'CORE_COURSE_NAME_2_S_ENGL 1102',
 'CORE_COURSE_NAME_2_S_FREN 1002',
 'CORE_COURSE_NAME_2_S_GEOG 1101I',
 'CORE_COURSE_NAME_2_S_ITDS 2726',
 'CORE_COURSE_NAME_2_S_LATN 1002',
 'CORE_COURSE_NAME_2_S_MATH 1125',
 'CORE_COURSE_NAME_2_S_MATH 1131',
 'CORE_COURSE_NAME_2_S_MATH 2125',
 'CORE_COURSE_NAME_2_S_SPAN 1001',
 'CORE_COURSE_GRADE_2_S_C',
 'CORE_COURSE_GRADE_2_S_F',
 'CORE_COURSE_GRADE_2_S_INCOMPL']
In [89]:
#Embeded Method
In [90]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), '1.25*median')
embeded_lr_selector.fit(x_norm,y)
Out[90]:
SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold='1.25*median')
In [91]:
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = x.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')
199 selected features
In [92]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='1.25*median')
embeded_rf_selector.fit(x, y)
Out[92]:
SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        norm_order=1, prefit=False, threshold='1.25*median')
In [93]:
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = x.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
221 selected features
In [94]:
embeded_rf_feature
Out[94]:
['STDNT_AGE',
 'STDNT_BACKGROUND',
 'STDNT_TEST_ENTRANCE_COMB',
 'DISTANCE_FROM_HOME',
 'HIGH_SCHL_GPA',
 'FATHER_HI_EDU_CD',
 'MOTHER_HI_EDU_CD',
 'FIRST_TERM_EARNED_HRS',
 'SECOND_TERM_EARNED_HRS',
 'COST_OF_ATTEND',
 'EST_FAM_CONTRIBUTION',
 'STDNT_GENDER_F',
 'STDNT_GENDER_M',
 'IN_STATE_FLAG_N',
 'IN_STATE_FLAG_Y',
 'INTERNATIONAL_STS_N',
 'INTERNATIONAL_STS_Y',
 'STDNT_MAJOR_Accounting',
 'STDNT_MAJOR_Applied Computer Science',
 'STDNT_MAJOR_Art',
 'STDNT_MAJOR_Biology',
 'STDNT_MAJOR_Biology and Secondary Ed',
 'STDNT_MAJOR_Chemistry',
 'STDNT_MAJOR_Communication',
 'STDNT_MAJOR_Computer Science - Games',
 'STDNT_MAJOR_Computer Science - Systems',
 'STDNT_MAJOR_Criminal Justice',
 'STDNT_MAJOR_Early Childhood Education',
 'STDNT_MAJOR_Engineering Studies',
 'STDNT_MAJOR_English Language/Literature',
 'STDNT_MAJOR_Exercise Science',
 'STDNT_MAJOR_Finance',
 'STDNT_MAJOR_General Business',
 'STDNT_MAJOR_Health Science',
 'STDNT_MAJOR_Health and Physical Education',
 'STDNT_MAJOR_History',
 'STDNT_MAJOR_History and Secondary Ed',
 'STDNT_MAJOR_Management',
 'STDNT_MAJOR_Management Information Systems',
 'STDNT_MAJOR_Marketing',
 'STDNT_MAJOR_Music',
 'STDNT_MAJOR_Music Education',
 'STDNT_MAJOR_Music Performance',
 'STDNT_MAJOR_Nursing',
 'STDNT_MAJOR_Political Science',
 'STDNT_MAJOR_Pre-Business',
 'STDNT_MAJOR_Pre-Nursing',
 'STDNT_MAJOR_Psychology',
 'STDNT_MAJOR_Theatre Arts',
 'STDNT_MAJOR_Undeclared',
 'STDNT_MINOR_English Language/Literature',
 'STDNT_MINOR_N',
 'STDNT_MINOR_Psychology',
 'STDNT_MINOR_Sociology',
 'CORE_COURSE_NAME_1_F_ANTH 1105',
 'CORE_COURSE_NAME_1_F_ARTH 1100',
 'CORE_COURSE_NAME_1_F_BIOL 1125',
 'CORE_COURSE_NAME_1_F_BIOL 1215K',
 'CORE_COURSE_NAME_1_F_CHEM 1151',
 'CORE_COURSE_NAME_1_F_CHEM 1151L',
 'CORE_COURSE_NAME_1_F_CHEM 1211',
 'CORE_COURSE_NAME_1_F_COMM 1110',
 'CORE_COURSE_NAME_1_F_CPSC 1105',
 'CORE_COURSE_NAME_1_F_CPSC 1301',
 'CORE_COURSE_NAME_1_F_ECON 2105',
 'CORE_COURSE_NAME_1_F_ECON 2106',
 'CORE_COURSE_NAME_1_F_ENGL 1101',
 'CORE_COURSE_NAME_1_F_FREN 1001',
 'CORE_COURSE_NAME_1_F_HIST 1111',
 'CORE_COURSE_NAME_1_F_HIST 1112I',
 'CORE_COURSE_NAME_1_F_HIST 2111',
 'CORE_COURSE_NAME_1_F_HIST 2112',
 'CORE_COURSE_NAME_1_F_ITDS 2735',
 'CORE_COURSE_NAME_1_F_MATH 1101',
 'CORE_COURSE_NAME_1_F_MATH 1111',
 'CORE_COURSE_NAME_1_F_MATH 1113',
 'CORE_COURSE_NAME_1_F_MUSC 1100',
 'CORE_COURSE_NAME_1_F_POLS 1101',
 'CORE_COURSE_NAME_1_F_PSYC 1101',
 'CORE_COURSE_NAME_1_F_SOCI 1101',
 'CORE_COURSE_NAME_1_F_SPAN 1001',
 'CORE_COURSE_NAME_1_F_THEA 1100',
 'CORE_COURSE_GRADE_1_F_A',
 'CORE_COURSE_GRADE_1_F_B',
 'CORE_COURSE_GRADE_1_F_C',
 'CORE_COURSE_GRADE_1_F_D',
 'CORE_COURSE_GRADE_1_F_F',
 'CORE_COURSE_GRADE_1_F_NOT REP',
 'CORE_COURSE_NAME_2_F_ANTH 1105',
 'CORE_COURSE_NAME_2_F_ARTH 1100',
 'CORE_COURSE_NAME_2_F_BIOL 1125',
 'CORE_COURSE_NAME_2_F_BIOL 1215K',
 'CORE_COURSE_NAME_2_F_CHEM 1151',
 'CORE_COURSE_NAME_2_F_CHEM 1151L',
 'CORE_COURSE_NAME_2_F_CHEM 1211',
 'CORE_COURSE_NAME_2_F_CHEM 1211L',
 'CORE_COURSE_NAME_2_F_COMM 1110',
 'CORE_COURSE_NAME_2_F_CPSC 1301',
 'CORE_COURSE_NAME_2_F_ECON 2105',
 'CORE_COURSE_NAME_2_F_ECON 2106',
 'CORE_COURSE_NAME_2_F_ENGL 1101',
 'CORE_COURSE_NAME_2_F_FREN 1001',
 'CORE_COURSE_NAME_2_F_HIST 1111',
 'CORE_COURSE_NAME_2_F_HIST 1112',
 'CORE_COURSE_NAME_2_F_HIST 2111',
 'CORE_COURSE_NAME_2_F_HIST 2112',
 'CORE_COURSE_NAME_2_F_ITDS 2735',
 'CORE_COURSE_NAME_2_F_MATH 1101',
 'CORE_COURSE_NAME_2_F_MATH 1111',
 'CORE_COURSE_NAME_2_F_MUSC 1100',
 'CORE_COURSE_NAME_2_F_POLS 1101',
 'CORE_COURSE_NAME_2_F_PSYC 1101',
 'CORE_COURSE_NAME_2_F_SOCI 1101',
 'CORE_COURSE_NAME_2_F_SPAN 1001',
 'CORE_COURSE_NAME_2_F_THEA 1100',
 'CORE_COURSE_GRADE_2_F_A',
 'CORE_COURSE_GRADE_2_F_B',
 'CORE_COURSE_GRADE_2_F_C',
 'CORE_COURSE_GRADE_2_F_D',
 'CORE_COURSE_GRADE_2_F_F',
 'CORE_COURSE_NAME_3_F_ANTH 1105',
 'CORE_COURSE_NAME_3_F_ARTH 1100',
 'CORE_COURSE_NAME_3_F_BIOL 1215K',
 'CORE_COURSE_NAME_3_F_CHEM 1151',
 'CORE_COURSE_NAME_3_F_CHEM 1151L',
 'CORE_COURSE_NAME_3_F_CHEM 1211L',
 'CORE_COURSE_NAME_3_F_COMM 1110',
 'CORE_COURSE_NAME_3_F_CPSC 1301L',
 'CORE_COURSE_NAME_3_F_ECON 2106',
 'CORE_COURSE_NAME_3_F_ENGL 1101',
 'CORE_COURSE_NAME_3_F_ENGL 1101I',
 'CORE_COURSE_NAME_3_F_ENGL 1102',
 'CORE_COURSE_NAME_3_F_HIST 1112',
 'CORE_COURSE_NAME_3_F_HIST 2111',
 'CORE_COURSE_NAME_3_F_HIST 2112',
 'CORE_COURSE_NAME_3_F_ITDS 1145',
 'CORE_COURSE_NAME_3_F_ITDS 2735',
 'CORE_COURSE_NAME_3_F_ITDS 2749',
 'CORE_COURSE_NAME_3_F_LEAD 1705',
 'CORE_COURSE_NAME_3_F_MATH 1101',
 'CORE_COURSE_NAME_3_F_MATH 1111',
 'CORE_COURSE_NAME_3_F_MATH 1113',
 'CORE_COURSE_NAME_3_F_MUSC 1100',
 'CORE_COURSE_NAME_3_F_POLS 1101',
 'CORE_COURSE_NAME_3_F_PSYC 1101',
 'CORE_COURSE_NAME_3_F_SOCI 1101',
 'CORE_COURSE_NAME_3_F_SPAN 1001',
 'CORE_COURSE_NAME_3_F_SPAN 1002',
 'CORE_COURSE_NAME_3_F_THEA 1100',
 'CORE_COURSE_GRADE_3_F_A',
 'CORE_COURSE_GRADE_3_F_B',
 'CORE_COURSE_GRADE_3_F_C',
 'CORE_COURSE_GRADE_3_F_D',
 'CORE_COURSE_GRADE_3_F_F',
 'CORE_COURSE_NAME_1_S_ANTH 1105',
 'CORE_COURSE_NAME_1_S_ARTH 1100',
 'CORE_COURSE_NAME_1_S_BIOL 1125',
 'CORE_COURSE_NAME_1_S_BIOL 1215K',
 'CORE_COURSE_NAME_1_S_CHEM 1151',
 'CORE_COURSE_NAME_1_S_CHEM 1211',
 'CORE_COURSE_NAME_1_S_COMM 1110',
 'CORE_COURSE_NAME_1_S_CPSC 1105',
 'CORE_COURSE_NAME_1_S_CPSC 1301',
 'CORE_COURSE_NAME_1_S_ECON 2105',
 'CORE_COURSE_NAME_1_S_ECON 2106',
 'CORE_COURSE_NAME_1_S_ENGL 1101',
 'CORE_COURSE_NAME_1_S_ENGL 1102',
 'CORE_COURSE_NAME_1_S_ENVS 1105',
 'CORE_COURSE_NAME_1_S_GEOL 1110',
 'CORE_COURSE_NAME_1_S_HIST 1111',
 'CORE_COURSE_NAME_1_S_HIST 1112',
 'CORE_COURSE_NAME_1_S_HIST 2111',
 'CORE_COURSE_NAME_1_S_HIST 2112',
 'CORE_COURSE_NAME_1_S_MATH 1101',
 'CORE_COURSE_NAME_1_S_MATH 1111',
 'CORE_COURSE_NAME_1_S_MATH 1113',
 'CORE_COURSE_NAME_1_S_MUSC 1100',
 'CORE_COURSE_NAME_1_S_POLS 1101',
 'CORE_COURSE_NAME_1_S_PSYC 1101',
 'CORE_COURSE_NAME_1_S_SPAN 1002',
 'CORE_COURSE_GRADE_1_S_A',
 'CORE_COURSE_GRADE_1_S_B',
 'CORE_COURSE_GRADE_1_S_C',
 'CORE_COURSE_GRADE_1_S_D',
 'CORE_COURSE_GRADE_1_S_F',
 'CORE_COURSE_GRADE_1_S_NOT REP',
 'CORE_COURSE_GRADE_1_S_Unknown',
 'CORE_COURSE_NAME_2_S_ANTH 1105',
 'CORE_COURSE_NAME_2_S_ARTH 1100',
 'CORE_COURSE_NAME_2_S_BIOL 1215K',
 'CORE_COURSE_NAME_2_S_CHEM 1151',
 'CORE_COURSE_NAME_2_S_CHEM 1151L',
 'CORE_COURSE_NAME_2_S_CHEM 1211',
 'CORE_COURSE_NAME_2_S_COMM 1110',
 'CORE_COURSE_NAME_2_S_ECON 2105',
 'CORE_COURSE_NAME_2_S_ENGL 1101',
 'CORE_COURSE_NAME_2_S_ENGL 1102',
 'CORE_COURSE_NAME_2_S_ENVS 1105',
 'CORE_COURSE_NAME_2_S_FREN 1001',
 'CORE_COURSE_NAME_2_S_HIST 1111',
 'CORE_COURSE_NAME_2_S_HIST 1112',
 'CORE_COURSE_NAME_2_S_HIST 2111',
 'CORE_COURSE_NAME_2_S_HIST 2112',
 'CORE_COURSE_NAME_2_S_ITDS 2735',
 'CORE_COURSE_NAME_2_S_MATH 1101',
 'CORE_COURSE_NAME_2_S_MATH 1111',
 'CORE_COURSE_NAME_2_S_MATH 1113',
 'CORE_COURSE_NAME_2_S_MATH 1125',
 'CORE_COURSE_NAME_2_S_MUSC 1100',
 'CORE_COURSE_NAME_2_S_POLS 1101',
 'CORE_COURSE_NAME_2_S_PSYC 1101',
 'CORE_COURSE_NAME_2_S_SOCI 1101',
 'CORE_COURSE_NAME_2_S_STAT 1127',
 'CORE_COURSE_NAME_2_S_THEA 1100',
 'CORE_COURSE_GRADE_2_S_A',
 'CORE_COURSE_GRADE_2_S_B',
 'CORE_COURSE_GRADE_2_S_C',
 'CORE_COURSE_GRADE_2_S_D',
 'CORE_COURSE_GRADE_2_S_F',
 'HOUSING_STS_Off Campus',
 'HOUSING_STS_On Campus']
In [95]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':x.columns.tolist(), 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support,})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100)
Out[95]:
Feature Pearson Chi-2 RFE Logistics Random Forest Total
1 STDNT_MINOR_English Language/Literature True True True True True 5
2 STDNT_MAJOR_Theatre Arts True True True True True 5
3 STDNT_MAJOR_Music Performance True True True True True 5
4 STDNT_MAJOR_Music Education True True True True True 5
5 STDNT_MAJOR_Biology and Secondary Ed True True True True True 5
6 CORE_COURSE_NAME_3_F_SPAN 1001 True True True True True 5
7 CORE_COURSE_NAME_3_F_ITDS 2735 True True True True True 5
8 CORE_COURSE_NAME_3_F_ENGL 1102 True True True True True 5
9 CORE_COURSE_NAME_3_F_ENGL 1101I True True True True True 5
10 CORE_COURSE_NAME_3_F_ECON 2106 True True True True True 5
11 CORE_COURSE_NAME_3_F_COMM 1110 True True True True True 5
12 CORE_COURSE_NAME_3_F_BIOL 1215K True True True True True 5
13 CORE_COURSE_NAME_3_F_ANTH 1105 True True True True True 5
14 CORE_COURSE_NAME_2_S_MATH 1125 True True True True True 5
15 CORE_COURSE_NAME_2_S_ENGL 1102 True True True True True 5
16 CORE_COURSE_NAME_2_F_POLS 1101 True True True True True 5
17 CORE_COURSE_NAME_1_S_HIST 1112 True True True True True 5
18 CORE_COURSE_NAME_1_S_HIST 1111 True True True True True 5
19 CORE_COURSE_NAME_1_S_ENGL 1102 True True True True True 5
20 CORE_COURSE_NAME_1_F_ECON 2106 True True True True True 5
21 CORE_COURSE_GRADE_3_F_F True True True True True 5
22 CORE_COURSE_GRADE_2_S_F True True True True True 5
23 CORE_COURSE_GRADE_2_S_C True True True True True 5
24 CORE_COURSE_GRADE_2_F_F True True True True True 5
25 CORE_COURSE_GRADE_1_S_Unknown True True True True True 5
26 CORE_COURSE_GRADE_1_S_B True True True True True 5
27 CORE_COURSE_GRADE_1_S_A True True True True True 5
28 CORE_COURSE_GRADE_1_F_F True True True True True 5
29 CORE_COURSE_GRADE_1_F_B True True True True True 5
30 STDNT_MINOR_Women's Studies True True True True False 4
31 STDNT_MAJOR_Theatre Education True True True True False 4
32 STDNT_MAJOR_Psychology True True False True True 4
33 STDNT_MAJOR_Pre-Nursing True True False True True 4
34 STDNT_MAJOR_Music True True False True True 4
35 STDNT_MAJOR_Joint Enrollment - Accel True True True True False 4
36 STDNT_MAJOR_Health Science True True True False True 4
37 STDNT_MAJOR_Finance True True False True True 4
38 STDNT_MAJOR_Criminal Justice True True False True True 4
39 STDNT_MAJOR_Computer Science - Games True True False True True 4
40 STDNT_BACKGROUND True False True True True 4
41 INTERNATIONAL_STS_Y True True False True True 4
42 INTERNATIONAL_STS_N True False True True True 4
43 HOUSING_STS_Off Campus True True False True True 4
44 FIRST_TERM_EARNED_HRS True False True True True 4
45 DISTANCE_FROM_HOME True False True True True 4
46 CORE_COURSE_NAME_3_F_THEA 1100I True True True True False 4
47 CORE_COURSE_NAME_3_F_THEA 1100 True True False True True 4
48 CORE_COURSE_NAME_3_F_LATN 1002 True True True True False 4
49 CORE_COURSE_NAME_3_F_ITDS 1145 False True True True True 4
50 CORE_COURSE_NAME_3_F_ENGL 2136 True True True True False 4
51 CORE_COURSE_NAME_2_S_SPAN 1001 True True True True False 4
52 CORE_COURSE_NAME_2_S_MATH 2125 True True True True False 4
53 CORE_COURSE_NAME_2_S_MATH 1131 True True True True False 4
54 CORE_COURSE_NAME_2_S_ITDS 2735 True True False True True 4
55 CORE_COURSE_NAME_2_S_ECON 2106 True True True True False 4
56 CORE_COURSE_NAME_2_S_CPSC 1105 True True True True False 4
57 CORE_COURSE_NAME_2_F_PHYS 2311 True True True True False 4
58 CORE_COURSE_NAME_2_F_MATH 2125 True True True True False 4
59 CORE_COURSE_NAME_2_F_MATH 1113 True True True True False 4
60 CORE_COURSE_NAME_2_F_ENVS 1105I True True True True False 4
61 CORE_COURSE_NAME_2_F_ASTR 1105 True True True True False 4
62 CORE_COURSE_NAME_1_S_SPAN 1001 True True True True False 4
63 CORE_COURSE_NAME_1_S_MATH 1111 True True False True True 4
64 CORE_COURSE_NAME_1_S_CPSC 1301L True True True True False 4
65 CORE_COURSE_NAME_1_S_COMM 1110 True True False True True 4
66 CORE_COURSE_NAME_1_S_BIOL 1215K True True False True True 4
67 CORE_COURSE_NAME_1_F_GEOL 1121 True True True True False 4
68 CORE_COURSE_NAME_1_F_ENGL 2111 True True True True False 4
69 CORE_COURSE_NAME_1_F_ANTH 1105I True True True True False 4
70 CORE_COURSE_GRADE_3_F_C True True False True True 4
71 CORE_COURSE_GRADE_3_F_A True True False True True 4
72 CORE_COURSE_GRADE_2_S_B True True False True True 4
73 CORE_COURSE_GRADE_2_S_A True True False True True 4
74 STDNT_TEST_ENTRANCE_COMB False False True True True 3
75 STDNT_MINOR_Mathematics False True True True False 3
76 STDNT_MAJOR_Undeclared True True False False True 3
77 STDNT_MAJOR_General Business True True False False True 3
78 STDNT_GENDER_M True False False True True 3
79 SECOND_TERM_EARNED_HRS True True False False True 3
80 IN_STATE_FLAG_N True True False False True 3
81 HOUSING_STS_On Campus True True False False True 3
82 FATHER_HI_EDU_CD True False False True True 3
83 CORE_COURSE_NAME_3_F_SPAN 1002 False False True True True 3
84 CORE_COURSE_NAME_3_F_PHYS 2211 True True False True False 3
85 CORE_COURSE_NAME_3_F_CHEM 1211 False True True True False 3
86 CORE_COURSE_NAME_2_S_POLS 1101 True True False False True 3
87 CORE_COURSE_NAME_2_S_ITDS 2726 False True True True False 3
88 CORE_COURSE_NAME_2_S_COMM 1110 True True False False True 3
89 CORE_COURSE_NAME_2_S_CHEM 1211 False False True True True 3
90 CORE_COURSE_NAME_2_F_ITDS 1155 False True True True False 3
91 CORE_COURSE_NAME_2_F_FREN 1001 False False True True True 3
92 CORE_COURSE_NAME_2_F_CHEM 1151L True True False False True 3
93 CORE_COURSE_NAME_1_S_MUSC 1100 True True False False True 3
94 CORE_COURSE_NAME_1_S_MATH 1113 False False True True True 3
95 CORE_COURSE_NAME_1_S_HIST 2112 True True False False True 3
96 CORE_COURSE_NAME_1_S_ENVS 1105 False False True True True 3
97 CORE_COURSE_NAME_1_S_CHEM 1211 False False True True True 3
98 CORE_COURSE_NAME_1_S_BIOL 1225K False True True True False 3
99 CORE_COURSE_NAME_1_F_SPAN 1001 False False True True True 3
100 CORE_COURSE_NAME_1_F_MATH 1132 False True True True False 3
In [96]:
#We can also use this for feature selection but it will take more time for printing Accuracy
#for index in range(1,493):
#    sel=RFE(GradientBoostingClassifier(n_estimators=100,random_state=0),n_features_to_select=index)
#    sel.fit(x_train,y_train)
#    x_train_rfe=sel.transform(x_train)
#    x_test_rfe=sel.transform(x_test)
#    print('Selected Feature:',index)
#   run_randomForest(x_train_rfe,x_test_rfe,y_train,y_test)
#   print()

Model Building

In [97]:
count_no_left = len(df3[df3['RETURNED_2ND_YR']==1])
count_join = len(df3[df3['RETURNED_2ND_YR']==0])
pct_of_no_left = count_no_left/(count_no_left+count_join)
print("percentage of no of left is", pct_of_no_left*100)
pct_of_join = count_join/(count_no_left+count_join)
print("percentage of join", pct_of_join*100)
percentage of no of left is 21.264705882352942
percentage of join 78.73529411764706
In [98]:
#Our classes are imbalanced, and the ratio of join and left students is 79:21. 
In [99]:
df4=df.copy()
In [100]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Create a label encoder object--to avoid duplicate
le = LabelEncoder()
In [101]:
# Label Encoding will be used for columns with 2 or less unique values
le_count = 0
for col in df4.columns[1:]:
    if df4[col].dtype == 'object':
        if len(list(df4[col].unique())) <= 2:
            le.fit(df4[col])
            df4[col] = le.transform(df4[col])
            le_count += 1
print('{} columns were label encoded.'.format(le_count))
4 columns were label encoded.
In [102]:
df4.head()
Out[102]:
STDNT_AGE STDNT_GENDER STDNT_BACKGROUND IN_STATE_FLAG INTERNATIONAL_STS STDNT_MAJOR STDNT_MINOR STDNT_TEST_ENTRANCE_COMB CORE_COURSE_NAME_1_F CORE_COURSE_GRADE_1_F ... RETURNED_2ND_YR DISTANCE_FROM_HOME HIGH_SCHL_GPA FATHER_HI_EDU_CD MOTHER_HI_EDU_CD DEGREE_GROUP_CD FIRST_TERM_EARNED_HRS SECOND_TERM_EARNED_HRS COST_OF_ATTEND EST_FAM_CONTRIBUTION
0 18 0 1 1 0 Undeclared N 1150.0 ANTH 1105 A ... 1 150.0 4.00 2 3.0 B 16 14.0 0 0
1 19 0 1 0 0 Undeclared N 1190.0 ANTH 1105 A ... 0 69.0 2.89 3 3.0 B 18 18.0 1355760 785760
2 18 1 1 1 0 Mathematics N 1030.0 ANTH 1105 A ... 0 150.0 3.39 2 4.0 B 15 14.0 0 0
3 18 1 1 1 0 Undeclared N 1220.0 ANTH 1107 A ... 0 150.0 2.93 3 3.0 B 13 14.0 0 0
4 18 0 1 1 0 Art N 1190.0 ANTH 1107 A ... 0 69.0 3.86 3 2.0 B 12 12.0 1355760 519840

5 rows × 29 columns

In [103]:
df4.columns
Out[103]:
Index(['STDNT_AGE', 'STDNT_GENDER', 'STDNT_BACKGROUND', 'IN_STATE_FLAG',
       'INTERNATIONAL_STS', 'STDNT_MAJOR', 'STDNT_MINOR',
       'STDNT_TEST_ENTRANCE_COMB', 'CORE_COURSE_NAME_1_F',
       'CORE_COURSE_GRADE_1_F', 'CORE_COURSE_NAME_2_F',
       'CORE_COURSE_GRADE_2_F', 'CORE_COURSE_NAME_3_F',
       'CORE_COURSE_GRADE_3_F', 'CORE_COURSE_NAME_1_S',
       'CORE_COURSE_GRADE_1_S', 'CORE_COURSE_NAME_2_S',
       'CORE_COURSE_GRADE_2_S', 'HOUSING_STS', 'RETURNED_2ND_YR',
       'DISTANCE_FROM_HOME', 'HIGH_SCHL_GPA', 'FATHER_HI_EDU_CD',
       'MOTHER_HI_EDU_CD', 'DEGREE_GROUP_CD', 'FIRST_TERM_EARNED_HRS',
       'SECOND_TERM_EARNED_HRS', 'COST_OF_ATTEND', 'EST_FAM_CONTRIBUTION'],
      dtype='object')
In [104]:
df3.head()
Out[104]:
STDNT_AGE STDNT_GENDER STDNT_BACKGROUND IN_STATE_FLAG INTERNATIONAL_STS STDNT_MAJOR STDNT_MINOR STDNT_TEST_ENTRANCE_COMB CORE_COURSE_NAME_1_F CORE_COURSE_GRADE_1_F ... RETURNED_2ND_YR DISTANCE_FROM_HOME HIGH_SCHL_GPA FATHER_HI_EDU_CD MOTHER_HI_EDU_CD DEGREE_GROUP_CD FIRST_TERM_EARNED_HRS SECOND_TERM_EARNED_HRS COST_OF_ATTEND EST_FAM_CONTRIBUTION
0 18 F 1 Y N Undeclared N 1150.0 ANTH 1105 A ... 1 150.0 4.00 2 3.0 B 16 14.0 0 0
1 19 F 1 N N Undeclared N 1190.0 ANTH 1105 A ... 0 69.0 2.89 3 3.0 B 18 18.0 1355760 785760
2 18 M 1 Y N Mathematics N 1030.0 ANTH 1105 A ... 0 150.0 3.39 2 4.0 B 15 14.0 0 0
3 18 M 1 Y N Undeclared N 1220.0 ANTH 1107 A ... 0 150.0 2.93 3 3.0 B 13 14.0 0 0
4 18 F 1 Y N Art N 1190.0 ANTH 1107 A ... 0 69.0 3.86 3 2.0 B 12 12.0 1355760 519840

5 rows × 29 columns

In [105]:
X=df4.drop(["RETURNED_2ND_YR"],axis=1)
Y=df4['RETURNED_2ND_YR']
X=pd.get_dummies(X)
In [106]:
import sklearn.model_selection as model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
In [107]:
# Split-out validation dataset
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.20, random_state=0)
In [108]:
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('RF', RandomForestClassifier()))
models.append(('xgboost', XGBClassifier(random_state=7)))
models.append(('DT',DecisionTreeClassifier()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
LR: 0.780891 (0.003134)
RF: 0.812151 (0.013417)
xgboost: 0.815084 (0.015124)
DT: 0.736024 (0.013044)
In [109]:
auc_results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_auc_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='roc_auc')
    auc_results.append(cv_auc_results)
       # auc_results.append(cv_auc_results)

    names.append(name)
    print('%s: %f (%f)' % (name, cv_auc_results.mean(), cv_auc_results.std()))
LR: 0.513028 (0.030372)
RF: 0.676057 (0.037526)
xgboost: 0.700603 (0.033126)
DT: 0.618072 (0.028705)
In [110]:
import matplotlib.pyplot as plt
# Compare Algorithms
fig = plt.figure(figsize=(15, 7))
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')
plt.show()
In [111]:
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions Y_train dataset: ", Y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions Y_test dataset: ", Y_test.shape)
Number transactions X_train dataset:  (2720, 492)
Number transactions Y_train dataset:  (2720,)
Number transactions X_test dataset:  (680, 492)
Number transactions Y_test dataset:  (680,)

LogisticRegression

In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Out[112]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [113]:
#Accuracy
logreg.score(X_test,Y_test)
Out[113]:
0.8088235294117647
In [114]:
Y_pred = logreg.predict(X_test)
In [115]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, Y_test)))
Accuracy of logistic regression classifier on test set: 0.81
In [116]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, Y_pred)
print(confusion_matrix)
[[550   1]
 [129   0]]

The result is telling us that we have 550+0 correct predictions and 129+1 incorrect predictions.

In [117]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))
             precision    recall  f1-score   support

          0       0.81      1.00      0.89       551
          1       0.00      0.00      0.00       129

avg / total       0.66      0.81      0.72       680

The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative. The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0. The F-beta score weights the recall more than the precision by a factor of beta. beta = 1.0 means recall and precision are equally important. The support is the number of occurrences of each class in y_test.

In [118]:
# precision-recall curve and f1
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
In [119]:
# generate 2 class dataset
#X, Y = make_classification(n_samples=1000, n_classes=2, random_state=0)
In [120]:
# predict probabilities
lr_probs = logreg.predict_proba(X_test)
In [121]:
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
In [122]:
# predict class values
yhat = logreg.predict(X_test)
lr_precision, lr_recall, _ = precision_recall_curve(Y_test, lr_probs)
lr_f1, lr_auc = f1_score(Y_test, yhat), auc(lr_recall, lr_precision)
In [123]:
# summarize scores
print('Logistic: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))
Logistic: f1=0.000 auc=0.182
In [124]:
# plot the precision-recall curves
no_skill = len(Y_test[Y_test==1]) / len(Y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
In [125]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(Y_test, logreg.predict(X_test))
fpr, tpr, thresholds = roc_curve(Y_test, logreg.predict_proba(X_test)[:,1])
plt.figure(figsize=(14, 6))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

Random Forest

In [126]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=210,oob_score=True,n_jobs=-1,random_state=400)
clf.fit(X_train,Y_train)
Out[126]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=210, n_jobs=-1,
            oob_score=True, random_state=400, verbose=0, warm_start=False)
In [127]:
print(clf.oob_score_)
print(clf.score(X_train,Y_train))
0.8213235294117647
1.0
In [128]:
clf.score(X_test,Y_test)
Out[128]:
0.836764705882353
In [129]:
Y_pred = clf.predict(X_test)
In [130]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, Y_pred)
print(confusion_matrix)
[[544   7]
 [104  25]]

The result is telling us that we have 544+25 correct predictions and 104+7 incorrect predictions.

In [131]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))
             precision    recall  f1-score   support

          0       0.84      0.99      0.91       551
          1       0.78      0.19      0.31       129

avg / total       0.83      0.84      0.79       680

In [132]:
# precision-recall curve and f1
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
In [133]:
# generate 2 class dataset
#X, Y = make_classification(n_samples=1000, n_classes=2, random_state=0)
In [134]:
# predict probabilities
rf_probs = clf.predict_proba(X_test)
In [135]:
# keep probabilities for the positive outcome only
rf_probs = rf_probs[:, 1]
In [136]:
# predict class values
yhat = clf.predict(X_test)
rf_precision, rf_recall, _ = precision_recall_curve(Y_test, rf_probs)
rf_f1, rf_auc = f1_score(Y_test, yhat), auc(rf_recall, rf_precision)
In [137]:
# summarize scores
print('RandomForest: f1=%.3f auc=%.3f' % (rf_f1, rf_auc))
RandomForest: f1=0.311 auc=0.469
In [138]:
# plot the precision-recall curves
no_skill = len(Y_test[Y_test==1]) / len(Y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='RandomForest')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
In [139]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
clf_roc_auc = roc_auc_score(Y_test, clf.predict(X_test))
fpr, tpr, thresholds = roc_curve(Y_test, clf.predict_proba(X_test)[:,1])
plt.figure(figsize=(14, 6))
plt.plot(fpr, tpr, label='Random Forest (area = %0.2f)' % clf_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('RF_ROC')
plt.show()

GridSearchCV

In [141]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf=GradientBoostingClassifier(n_estimators=80,random_state=400)
from sklearn.model_selection import GridSearchCV
mod=GridSearchCV(gb_clf,param_grid={'n_estimators':[80,180,200,220,240,260,280,300]})
mod.fit(X_train,Y_train)
mod.best_estimator_
Out[141]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=80,
              presort='auto', random_state=400, subsample=1.0, verbose=0,
              warm_start=False)

XGBClassifier

In [142]:
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(n_estimators=80,random_state=400)
xgb_clf.fit(X_train, Y_train)
score = xgb_clf.score(X_test,Y_test)
print(score)
0.8323529411764706
In [143]:
Y_pred = xgb_clf.predict(X_test)
In [144]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, Y_pred)
print(confusion_matrix)
[[522  29]
 [ 85  44]]

The result is telling us that we have 522+44 correct predictions and 85+29 incorrect predictions.

In [145]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))
             precision    recall  f1-score   support

          0       0.86      0.95      0.90       551
          1       0.60      0.34      0.44       129

avg / total       0.81      0.83      0.81       680

In [146]:
# precision-recall curve and f1
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
In [147]:
# generate 2 class dataset
#X, Y = make_classification(n_samples=1000, n_classes=2, random_state=0)
In [148]:
# predict probabilities
xgb_probs =xgb_clf.predict_proba(X_test)
In [149]:
# keep probabilities for the positive outcome only
xgb_probs = xgb_probs[:, 1]
In [150]:
# predict class values
yhat = xgb_clf.predict(X_test)
xgb_precision, xgb_recall, _ = precision_recall_curve(Y_test, xgb_probs)
xgb_f1, xgb_auc = f1_score(Y_test, yhat), auc(xgb_recall, xgb_precision)
In [151]:
# summarize scores
print('Xgboost: f1=%.3f auc=%.3f' % (xgb_f1, xgb_auc))
Xgboost: f1=0.436 auc=0.478
In [152]:
# plot the precision-recall curves
no_skill = len(Y_test[Y_test==1]) / len(Y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(xgb_recall, xgb_precision, marker='.', label='XGBoost')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
In [153]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
xgb_clf_roc_auc = roc_auc_score(Y_test, xgb_clf.predict(X_test))
fpr, tpr, thresholds = roc_curve(Y_test, xgb_clf.predict_proba(X_test)[:,1])
plt.figure(figsize=(14, 6))
plt.plot(fpr, tpr, label='XGBoost (area = %0.2f)' % xgb_clf_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('xgb_ROC')
plt.show()
In [154]:
mod1=GridSearchCV(xgb_clf,param_grid={'n_estimators':[80,100,120,140,160,180,200,220,240,260,280,300]})
mod1.fit(X_train,Y_train)
mod1.best_estimator_
Out[154]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=80, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=400, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)
In [155]:
xgb_feat=pd.Series(xgb_clf.feature_importances_,index=X_train.columns).sort_values(ascending=False).reset_index()
In [156]:
xgb_feat1=xgb_feat[xgb_feat[0]>0]
In [157]:
xgb_feat1
Out[157]:
index 0
0 CORE_COURSE_GRADE_2_S_F 0.028109
1 SECOND_TERM_EARNED_HRS 0.019571
2 CORE_COURSE_GRADE_1_S_A 0.013735
3 CORE_COURSE_NAME_2_F_HIST 2112 0.013497
4 CORE_COURSE_GRADE_1_S_B 0.012586
5 CORE_COURSE_NAME_3_F_MATH 1113 0.011627
6 CORE_COURSE_NAME_1_S_HIST 1112 0.011404
7 CORE_COURSE_GRADE_1_F_D 0.011067
8 CORE_COURSE_NAME_1_F_THEA 1100 0.011006
9 CORE_COURSE_GRADE_2_F_F 0.010906
10 CORE_COURSE_GRADE_2_S_D 0.010534
11 CORE_COURSE_NAME_1_F_HIST 1111 0.010379
12 CORE_COURSE_NAME_1_F_CHEM 1151 0.010145
13 CORE_COURSE_NAME_2_S_MATH 1111 0.010079
14 CORE_COURSE_NAME_2_F_ECON 2106 0.009847
15 STDNT_MAJOR_Pre-Business 0.009658
16 CORE_COURSE_NAME_3_F_ENGL 1102 0.009620
17 CORE_COURSE_NAME_3_F_CPSC 1301L 0.009578
18 CORE_COURSE_NAME_1_S_ENVS 1105 0.009392
19 CORE_COURSE_GRADE_1_S_Unknown 0.009376
20 STDNT_MAJOR_Criminal Justice 0.008990
21 CORE_COURSE_NAME_2_S_CHEM 1211 0.008856
22 CORE_COURSE_GRADE_1_F_NOT REP 0.008772
23 STDNT_MAJOR_Nursing 0.008695
24 CORE_COURSE_NAME_2_S_COMM 1110 0.008494
25 CORE_COURSE_NAME_1_F_HIST 2112 0.008451
26 CORE_COURSE_NAME_3_F_MUSC 1100 0.008424
27 CORE_COURSE_GRADE_1_S_F 0.008301
28 CORE_COURSE_NAME_1_F_HIST 2111 0.008106
29 CORE_COURSE_GRADE_1_F_F 0.007990
30 CORE_COURSE_NAME_2_S_ENGL 1102 0.007844
31 STDNT_MAJOR_Exercise Science 0.007761
32 CORE_COURSE_NAME_3_F_CHEM 1151 0.007734
33 CORE_COURSE_NAME_1_F_CHEM 1211 0.007733
34 CORE_COURSE_NAME_2_F_POLS 1101 0.007681
35 CORE_COURSE_NAME_3_F_CHEM 1151L 0.007663
36 STDNT_GENDER 0.007559
37 CORE_COURSE_NAME_1_F_COMM 1110 0.007549
38 CORE_COURSE_NAME_3_F_ECON 2106 0.007484
39 CORE_COURSE_NAME_1_F_PSYC 1101 0.007477
40 CORE_COURSE_GRADE_3_F_C 0.007472
41 CORE_COURSE_NAME_3_F_THEA 1100 0.007458
42 STDNT_MINOR_N 0.007404
43 STDNT_MAJOR_Management 0.007383
44 CORE_COURSE_NAME_1_S_PSYC 1101 0.007305
45 DISTANCE_FROM_HOME 0.007232
46 INTERNATIONAL_STS 0.007088
47 CORE_COURSE_NAME_3_F_ENVS 1105 0.007045
48 IN_STATE_FLAG 0.006984
49 CORE_COURSE_NAME_3_F_POLS 1101 0.006849
50 CORE_COURSE_GRADE_3_F_F 0.006799
51 STDNT_MAJOR_Pre-Nursing 0.006779
52 CORE_COURSE_GRADE_2_F_C 0.006766
53 FIRST_TERM_EARNED_HRS 0.006753
54 CORE_COURSE_NAME_2_F_MUSC 1100 0.006619
55 CORE_COURSE_NAME_3_F_ARTH 1100 0.006598
56 CORE_COURSE_NAME_1_S_HIST 2111 0.006446
57 CORE_COURSE_NAME_2_S_HIST 2112 0.006424
58 STDNT_MAJOR_Undeclared 0.006384
59 CORE_COURSE_NAME_2_F_HIST 2111 0.006375
60 CORE_COURSE_NAME_1_S_ECON 2105 0.006296
61 CORE_COURSE_NAME_2_S_HIST 2111 0.006295
62 CORE_COURSE_NAME_3_F_HIST 2111 0.006268
63 HOUSING_STS 0.006261
64 CORE_COURSE_NAME_3_F_ITDS 2735 0.006211
65 CORE_COURSE_NAME_3_F_CHEM 1211L 0.006176
66 CORE_COURSE_NAME_1_S_ENGL 1102 0.006161
67 CORE_COURSE_NAME_2_F_MATH 1111 0.006145
68 CORE_COURSE_NAME_2_S_ECON 2105 0.006102
69 STDNT_MAJOR_Theatre Arts 0.006079
70 CORE_COURSE_NAME_1_F_ECON 2106 0.005896
71 CORE_COURSE_NAME_1_F_ARTH 1100 0.005876
72 CORE_COURSE_GRADE_1_S_D 0.005869
73 CORE_COURSE_NAME_3_F_COMM 1110 0.005858
74 STDNT_MAJOR_English Language/Literature 0.005731
75 CORE_COURSE_NAME_1_F_MATH 1111 0.005679
76 STDNT_MAJOR_Joint Enrollment - Accel 0.005674
77 CORE_COURSE_NAME_2_S_PSYC 1101 0.005625
78 CORE_COURSE_NAME_1_S_COMM 1110 0.005582
79 CORE_COURSE_NAME_2_S_ITDS 2735 0.005565
80 CORE_COURSE_NAME_2_F_ECON 2105 0.005553
81 STDNT_TEST_ENTRANCE_COMB 0.005258
82 CORE_COURSE_NAME_3_F_MATH 1111 0.005253
83 CORE_COURSE_NAME_2_F_COMM 1110 0.005245
84 STDNT_MAJOR_Music Education 0.005242
85 CORE_COURSE_GRADE_1_F_B 0.005225
86 CORE_COURSE_NAME_2_S_MATH 1125 0.005145
87 CORE_COURSE_GRADE_2_F_D 0.005138
88 STDNT_AGE 0.005078
89 CORE_COURSE_GRADE_3_F_A 0.005056
90 CORE_COURSE_NAME_3_F_PSYC 1101 0.005055
91 STDNT_BACKGROUND 0.005053
92 COST_OF_ATTEND 0.004965
93 CORE_COURSE_GRADE_3_F_B 0.004952
94 CORE_COURSE_NAME_3_F_BIOL 1215K 0.004930
95 CORE_COURSE_NAME_2_S_SOCI 1101 0.004920
96 CORE_COURSE_GRADE_1_S_C 0.004868
97 CORE_COURSE_NAME_2_S_MUSC 1100 0.004813
98 FATHER_HI_EDU_CD 0.004813
99 CORE_COURSE_GRADE_2_F_B 0.004794
100 STDNT_MAJOR_Art 0.004779
101 CORE_COURSE_NAME_1_S_MATH 1111 0.004715
102 CORE_COURSE_NAME_1_S_BIOL 1125 0.004614
103 EST_FAM_CONTRIBUTION 0.004582
104 STDNT_MAJOR_Music Performance 0.004548
105 STDNT_MAJOR_Health and Physical Education 0.004525
106 CORE_COURSE_NAME_2_F_MATH 1113 0.004517
107 CORE_COURSE_GRADE_2_S_A 0.004455
108 MOTHER_HI_EDU_CD 0.004428
109 CORE_COURSE_GRADE_2_S_C 0.004395
110 CORE_COURSE_NAME_2_F_CHEM 1151 0.004356
111 STDNT_MAJOR_Biology 0.004332
112 CORE_COURSE_NAME_3_F_ENGL 1101 0.004284
113 CORE_COURSE_NAME_2_S_MATH 1101 0.004282
114 CORE_COURSE_NAME_2_F_CHEM 1151L 0.004231
115 HIGH_SCHL_GPA 0.004218
116 STDNT_MAJOR_Psychology 0.004176
117 CORE_COURSE_NAME_1_F_ENGL 1101 0.004170
118 CORE_COURSE_GRADE_2_S_B 0.004121
119 CORE_COURSE_NAME_2_S_CPSC 1105 0.004044
120 CORE_COURSE_NAME_2_F_ENGL 1101 0.003839
121 CORE_COURSE_NAME_1_F_SOCI 1101 0.003808
122 CORE_COURSE_NAME_1_S_ARTH 1100 0.003730
123 CORE_COURSE_GRADE_1_F_A 0.003730
124 STDNT_MAJOR_Computer Science - Games 0.003721
125 CORE_COURSE_NAME_2_S_POLS 1101 0.003704
126 STDNT_MAJOR_General Studies/AS 0.003574
127 CORE_COURSE_NAME_1_S_HIST 1111 0.003538
128 CORE_COURSE_GRADE_1_F_C 0.003339
129 CORE_COURSE_GRADE_2_F_A 0.003285
130 CORE_COURSE_NAME_2_F_THEA 1100 0.003148
131 CORE_COURSE_NAME_1_F_BIOL 1215K 0.002979
132 CORE_COURSE_NAME_1_S_POLS 1101 0.002973
133 CORE_COURSE_NAME_1_S_THEA 1100 0.002966
134 STDNT_MINOR_Music 0.002927
135 CORE_COURSE_NAME_2_S_BIOL 1215K 0.002889
136 CORE_COURSE_NAME_2_S_ENGL 1101 0.002803
137 CORE_COURSE_NAME_1_S_BIOL 1215K 0.002798
138 CORE_COURSE_NAME_1_F_ENVS 1105 0.002669
139 CORE_COURSE_NAME_1_F_ENGL 1102 0.002601
140 CORE_COURSE_NAME_2_S_BIOL 1125 0.002567
141 CORE_COURSE_NAME_2_S_STAT 1127 0.002549
142 CORE_COURSE_NAME_2_F_ENGL 1102 0.002516
143 CORE_COURSE_GRADE_3_F_D 0.002498
144 CORE_COURSE_NAME_1_F_FREN 1001 0.002448
145 CORE_COURSE_NAME_2_F_ASTR 1105 0.002397
146 CORE_COURSE_NAME_1_F_SPAN 1001 0.002394
147 CORE_COURSE_NAME_1_S_SPAN 1002 0.002360
148 STDNT_MAJOR_Communication 0.002342
149 STDNT_MAJOR_Health Science 0.002249
150 STDNT_MAJOR_Early Childhood Education 0.002246
151 CORE_COURSE_NAME_2_S_CHEM 1151 0.002243
152 CORE_COURSE_NAME_3_F_SPAN 1002 0.002237
153 CORE_COURSE_NAME_1_S_CHEM 1151 0.002194
154 STDNT_MAJOR_Sociology 0.002190
155 CORE_COURSE_NAME_2_F_CHEM 1211L 0.002178
156 CORE_COURSE_NAME_1_F_ECON 2105 0.002158
157 CORE_COURSE_NAME_2_S_MATH 1131 0.002139
158 STDNT_MAJOR_Marketing 0.002043
159 CORE_COURSE_NAME_2_F_FREN 1001 0.001921
160 CORE_COURSE_NAME_2_S_ECON 2106 0.001875
161 CORE_COURSE_NAME_1_F_BIOL 1125 0.001839
162 STDNT_MAJOR_Accounting 0.001802
163 CORE_COURSE_NAME_2_F_BIOL 1125 0.001799
164 CORE_COURSE_NAME_1_F_HIST 1112 0.001791
165 STDNT_MAJOR_Mathematics 0.001733
166 CORE_COURSE_NAME_3_F_ENGL 1101I 0.001664
167 CORE_COURSE_NAME_3_F_SOCI 1101 0.001608
168 STDNT_MAJOR_Information Technology 0.001607
169 CORE_COURSE_NAME_2_S_CHEM 1151L 0.001563
170 CORE_COURSE_NAME_3_F_ECON 2105 0.001525
171 CORE_COURSE_NAME_2_F_CPSC 1105 0.001497
172 CORE_COURSE_NAME_1_F_MUSC 1100 0.001414
173 STDNT_MAJOR_Management Information Systems 0.001406
174 STDNT_MINOR_Spanish 0.001406
175 STDNT_MAJOR_Theatre Education 0.001393
176 CORE_COURSE_NAME_1_S_ENGL 1101 0.001330
177 STDNT_MAJOR_Engineering Studies 0.001313
178 CORE_COURSE_NAME_3_F_SPAN 1001 0.001301
179 STDNT_MAJOR_Computer Science - Systems 0.001283
180 CORE_COURSE_NAME_2_S_ANTH 1105 0.001270
181 CORE_COURSE_NAME_1_S_ANTH 1107 0.001236
182 CORE_COURSE_NAME_2_S_SPAN 1002 0.001225
183 CORE_COURSE_NAME_1_S_ANTH 1105 0.001219
184 STDNT_MAJOR_History 0.000962
185 STDNT_MAJOR_Finance 0.000872
186 CORE_COURSE_NAME_2_F_ENGL 1101I 0.000857
In [158]:
import plotly.express as px
xgb_feat1=xgb_feat1.rename(columns={0:'feature importance'})
px.bar(xgb_feat1,x='index',y='feature importance',height=900)

DecisionTreeClassifier

In [159]:
import sklearn.tree as tree
DT=tree.DecisionTreeClassifier(max_depth=3,random_state=200)
DT.fit(X_train,Y_train)
Out[159]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=200,
            splitter='best')
In [160]:
DT.score(X_train,Y_train)
Out[160]:
0.8308823529411765
In [161]:
DT.score(X_test,Y_test)
Out[161]:
0.836764705882353
In [162]:
Y_pred = DT.predict(X_test)
In [163]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(Y_test, Y_pred)
print(confusion_matrix)
[[540  11]
 [100  29]]

The result is telling us that we have 540+29 correct predictions and 100+11 incorrect predictions

In [164]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))
             precision    recall  f1-score   support

          0       0.84      0.98      0.91       551
          1       0.72      0.22      0.34       129

avg / total       0.82      0.84      0.80       680

In [165]:
# precision-recall curve and f1
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
In [166]:
# generate 2 class dataset
#X, Y = make_classification(n_samples=1000, n_classes=2, random_state=0)
In [167]:
# predict probabilities
DT_probs =DT.predict_proba(X_test)
In [168]:
# keep probabilities for the positive outcome only
DT_probs = DT_probs[:, 1]
In [169]:
# predict class values
yhat = DT.predict(X_test)
DT_precision, DT_recall, _ = precision_recall_curve(Y_test, DT_probs)
DT_f1, DT_auc = f1_score(Y_test, yhat), auc(DT_recall, DT_precision)
In [170]:
# summarize scores
print('Decision Tree: f1=%.3f auc=%.3f' % (DT_f1, DT_auc))
Decision Tree: f1=0.343 auc=0.354
In [171]:
# plot the precision-recall curves
no_skill = len(Y_test[Y_test==1]) / len(Y_test)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(DT_recall, DT_precision, marker='.', label='Decision Tree')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
In [172]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
DT_roc_auc = roc_auc_score(Y_test, DT.predict(X_test))
fpr, tpr, thresholds = roc_curve(Y_test, DT.predict_proba(X_test)[:,1])
plt.figure(figsize=(14, 6))
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % DT_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('DT_ROC')
plt.show()
In [173]:
from sklearn.model_selection import GridSearchCV
mod=GridSearchCV(DT,param_grid={'max_depth':[3,7,8,9]})
mod.fit(X_train,Y_train)
Out[173]:
GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=200,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 7, 8, 9]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)
In [174]:
mod.best_estimator_
#best depth=3
Out[174]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=200,
            splitter='best')
In [175]:
best_feat=pd.DataFrame({'Features':X_train.columns,'Importance':clf.feature_importances_})
In [176]:
best_feat.sort_values('Importance',ascending=0).head(100).reset_index()
Out[176]:
index Features Importance
0 12 SECOND_TERM_EARNED_HRS 0.058057
1 8 HIGH_SCHL_GPA 0.038827
2 11 FIRST_TERM_EARNED_HRS 0.031802
3 5 STDNT_TEST_ENTRANCE_COMB 0.030387
4 7 DISTANCE_FROM_HOME 0.027547
5 436 CORE_COURSE_NAME_2_S_ENGL 1102 0.025250
6 13 COST_OF_ATTEND 0.024766
7 14 EST_FAM_CONTRIBUTION 0.022546
8 409 CORE_COURSE_GRADE_1_S_C 0.017115
9 2 STDNT_BACKGROUND 0.015575
10 9 FATHER_HI_EDU_CD 0.014959
11 10 MOTHER_HI_EDU_CD 0.013357
12 363 CORE_COURSE_NAME_1_S_ENGL 1102 0.013017
13 408 CORE_COURSE_GRADE_1_S_B 0.012340
14 484 CORE_COURSE_GRADE_2_S_B 0.010848
15 0 STDNT_AGE 0.010829
16 1 STDNT_GENDER 0.009626
17 6 HOUSING_STS 0.009195
18 485 CORE_COURSE_GRADE_2_S_C 0.009139
19 282 CORE_COURSE_NAME_3_F_ENGL 1101 0.008752
20 256 CORE_COURSE_GRADE_2_F_B 0.008219
21 336 CORE_COURSE_GRADE_3_F_B 0.007994
22 257 CORE_COURSE_GRADE_2_F_C 0.007544
23 128 CORE_COURSE_NAME_1_F_ENGL 1101 0.007523
24 179 CORE_COURSE_GRADE_1_F_F 0.007513
25 202 CORE_COURSE_NAME_2_F_ENGL 1101 0.007484
26 68 STDNT_MAJOR_Undeclared 0.007225
27 487 CORE_COURSE_GRADE_2_S_F 0.007073
28 483 CORE_COURSE_GRADE_2_S_A 0.006978
29 335 CORE_COURSE_GRADE_3_F_A 0.006731
30 177 CORE_COURSE_GRADE_1_F_C 0.006723
31 19 STDNT_MAJOR_Biology 0.006601
32 176 CORE_COURSE_GRADE_1_F_B 0.006537
33 178 CORE_COURSE_GRADE_1_F_D 0.006434
34 410 CORE_COURSE_GRADE_1_S_D 0.006284
35 411 CORE_COURSE_GRADE_1_S_F 0.005942
36 259 CORE_COURSE_GRADE_2_F_F 0.005782
37 337 CORE_COURSE_GRADE_3_F_C 0.005500
38 255 CORE_COURSE_GRADE_2_F_A 0.005149
39 3 IN_STATE_FLAG 0.005091
40 144 CORE_COURSE_NAME_1_F_HIST 2111 0.004997
41 234 CORE_COURSE_NAME_2_F_MATH 1111 0.004754
42 407 CORE_COURSE_GRADE_1_S_A 0.004634
43 245 CORE_COURSE_NAME_2_F_POLS 1101 0.004624
44 194 CORE_COURSE_NAME_2_F_COMM 1110 0.004572
45 120 CORE_COURSE_NAME_1_F_COMM 1110 0.004552
46 60 STDNT_MAJOR_Pre-Nursing 0.004454
47 258 CORE_COURSE_GRADE_2_F_D 0.004384
48 298 CORE_COURSE_NAME_3_F_HIST 2111 0.004302
49 61 STDNT_MAJOR_Psychology 0.004271
50 486 CORE_COURSE_GRADE_2_S_D 0.004225
51 220 CORE_COURSE_NAME_2_F_HIST 2111 0.004178
52 145 CORE_COURSE_NAME_1_F_HIST 2112 0.004152
53 156 CORE_COURSE_NAME_1_F_MATH 1111 0.004061
54 110 CORE_COURSE_NAME_1_F_ARTH 1100 0.003966
55 56 STDNT_MAJOR_Nursing 0.003929
56 353 CORE_COURSE_NAME_1_S_COMM 1110 0.003903
57 127 CORE_COURSE_NAME_1_F_ECON 2106 0.003832
58 452 CORE_COURSE_NAME_2_S_HIST 2112 0.003759
59 28 STDNT_MAJOR_Early Childhood Education 0.003716
60 451 CORE_COURSE_NAME_2_S_HIST 2111 0.003685
61 58 STDNT_MAJOR_Pre-Business 0.003675
62 175 CORE_COURSE_GRADE_1_F_A 0.003640
63 477 CORE_COURSE_NAME_2_S_PSYC 1101 0.003574
64 314 CORE_COURSE_NAME_3_F_MATH 1111 0.003565
65 398 CORE_COURSE_NAME_1_S_POLS 1101 0.003559
66 118 CORE_COURSE_NAME_1_F_CHEM 1211 0.003540
67 359 CORE_COURSE_NAME_1_S_ECON 2105 0.003533
68 379 CORE_COURSE_NAME_1_S_HIST 2112 0.003433
69 180 CORE_COURSE_GRADE_1_F_NOT REP 0.003424
70 389 CORE_COURSE_NAME_1_S_MATH 1111 0.003415
71 241 CORE_COURSE_NAME_2_F_MUSC 1100 0.003412
72 426 CORE_COURSE_NAME_2_S_COMM 1110 0.003394
73 116 CORE_COURSE_NAME_1_F_CHEM 1151 0.003385
74 465 CORE_COURSE_NAME_2_S_MATH 1111 0.003380
75 168 CORE_COURSE_NAME_1_F_PSYC 1101 0.003376
76 271 CORE_COURSE_NAME_3_F_CHEM 1151L 0.003362
77 344 CORE_COURSE_NAME_1_S_ARTH 1100 0.003354
78 97 STDNT_MINOR_N 0.003323
79 378 CORE_COURSE_NAME_1_S_HIST 2111 0.003321
80 263 CORE_COURSE_NAME_3_F_ARTH 1100 0.003318
81 361 CORE_COURSE_NAME_1_S_ENGL 1101 0.003190
82 26 STDNT_MAJOR_Criminal Justice 0.003147
83 434 CORE_COURSE_NAME_2_S_ENGL 1101 0.003125
84 321 CORE_COURSE_NAME_3_F_MUSC 1100 0.003124
85 474 CORE_COURSE_NAME_2_S_POLS 1101 0.003000
86 165 CORE_COURSE_NAME_1_F_POLS 1101 0.002968
87 401 CORE_COURSE_NAME_1_S_PSYC 1101 0.002821
88 274 CORE_COURSE_NAME_3_F_COMM 1110 0.002780
89 221 CORE_COURSE_NAME_2_F_HIST 2112 0.002749
90 376 CORE_COURSE_NAME_1_S_HIST 1112 0.002736
91 313 CORE_COURSE_NAME_3_F_MATH 1101 0.002670
92 325 CORE_COURSE_NAME_3_F_POLS 1101 0.002633
93 233 CORE_COURSE_NAME_2_F_MATH 1101 0.002573
94 270 CORE_COURSE_NAME_3_F_CHEM 1151 0.002560
95 17 STDNT_MAJOR_Art 0.002502
96 54 STDNT_MAJOR_Music Education 0.002472
97 190 CORE_COURSE_NAME_2_F_CHEM 1151 0.002467
98 184 CORE_COURSE_NAME_2_F_ARTH 1100 0.002454
99 330 CORE_COURSE_NAME_3_F_SPAN 1001 0.002421